###############################################################################
###############################################################################
## Create parameter list ##
###############################################################################
###############################################################################
## Set project directory
# projectDir <- gsub("scripts/bulkRNAseq_workflow/analyses/Main_Analysis", "", getwd())
projectDir <- paste0(unlist(strsplit(getwd(), "scripts"))[1])
designDir <- unlist(strsplit(getwd(), "analyses/"))[1]
pipelineList <- biologicSeqTools2::assembleBiologicProject(
## Path to the design file. Essential columns: sample.id, sample.group, dataseries. ##
## Ideal sample name: [dataseries]_[sample.group]_[replicate]
designFN = paste0(designDir, "design/design.table.txt"),
## Path to model table ##
modelFN = paste0(designDir, "design/model.table.txt"),
## Path to NFcore setting file. Set to NULL in no-alignment mode.
#NFcoreSettingsFN = "/camp/stp/babs/working/boeings/Projects/goulda/adrien.franchet/472A_brains_from_drosophila_larvae_RN21220/workdir/bulkRNAseq_workflow/design/RN21220test.NFcore.samplesheet.file.csv",
## Path to relevant FASTQ files.
#pathToSeqStorageFolder = c(
# "/camp/stp/babs/inputs/sequencing/data/goulda/adrien.franchet/RN21220/primary_data/211203_A01366_0104_AH3WVGDMXY/fastq/",
# "/camp/stp/babs/inputs/sequencing/data/goulda/adrien.franchet/RN21220/primary_data/211130_A01366_0101_BH3YWKDMXY/fastq/"
#),
## Path to RSEM count table. Essential.
countTableFN = paste0(projectDir, "data/rsem.count.txt"),
## Path to TPM table.
TpmTableFN = paste0(projectDir, "data/dfTPM.txt"),
biologicSettingsFN = paste0(designDir, "design/biologic.settings.file.csv"),
PcaFN = NULL,
#"/camp/stp/babs/working/boeings/Projects/goulda/adrien.franchet/472A_brains_from_drosophila_larvae_RN21220/workdir/data/dfPca.txt",
calculate_DGE = TRUE,
calculate_LRT = TRUE,
## Path to external DEseq2 output files
DEseq2External_DGE = NULL,
#"/camp/stp/babs/working/boeings/Projects/goulda/adrien.franchet/472A_brains_from_drosophila_larvae_RN21220/workdir/data/DEseq2External_DGE/",
DEseq2External_LRT = NULL,
#"/camp/stp/babs/working/boeings/Projects/goulda/adrien.franchet/472A_brains_from_drosophila_larvae_RN21220/workdir/data/DEseq2External_LRT/",
stranded = TRUE,
read.length = "100bp",
paired.end = TRUE,
#pathToRSEMresultsFiles = paste0("/camp/stp/babs/working/boeings/Projects/goulda/adrien.franchet/472_brains_from_drosophila_larvae_RN21220/workdir/", "RSEM/Ensembl/"),
projectFolder = projectDir,
experiment_id = "bulkliverdemo",
project_name = "Liver cancer pseudo bulk RNA-seq demo from Kang et al",
lims.id = "bulkliverdemo",
labname = "Park Lab",
NtopGenes = 500,
experiment.type = "bulk_rna_seq",
species = "homo_sapiens",
release = "release-105",
count.table.headline = "Expression Intensities for all Samples",
count.table.sidelabel = "Expr",
heatmap.headline.text = "Heatmap: Row-averaged Expr",
designTScol = NULL,
timecourse.units = NULL,
primDataDB = "demogtex_data",
db.user = "babs",
host = "clvd1-db-u-p-31.thecrick.org",
lab.categories.table = "demogtex_lab_categories",
corGeneVec = c("EZH2")
#corGeneVec = NULL
)
chnkPrefix <- "set.parameters."
VersionPdfExt <- VersionPdfExt <- paste0(".",chnkPrefix,"V", gsub("-", "", Sys.Date()), ".pdf")
###############################################################################
## Create biologic Metadata object ##
if (file.exists(pipelineList[["biologicSettingsFN"]])){
dfObio <- read.csv(pipelineList[["biologicSettingsFN"]], header = F, stringsAsFactors = F)
} else {
stop("biologic settings file not found.")
}
dfObio <- data.frame(t(dfObio), stringsAsFactors = F)
dfObio[is.na(dfObio)] <- ""
colnames(dfObio) <- t(dfObio[1,])
dfObio <- dfObio[-1,]
for (i in 1:ncol(dfObio)){
pos <- grep("#", dfObio[,i], useBytes = TRUE)
if (length(pos) > 0){
dfObio[pos, i] <- ""
}
}
## ##
###############################################################################
###############################################################################
## dbDetailList ##
dbDetailList <- list(
"primDataDB" = as.vector(dfObio$primDataDB[1]),
"ref.cat.db" = as.vector(dfObio$ref.cat.db[1]),
"db.user" = as.vector(dfObio$db.user[1]),
"host" = as.vector(dfObio$host[1])
)
## End dbDetailList ##
###############################################################################
###############################################################################
## Project detail list ##
fastqFolders <- as.vector(dfObio$fastqFolders)
fastqFolders <- fastqFolders[fastqFolders != ""]
lastChr <- sapply(fastqFolders, function(x) substr(x, nchar(x), nchar(x)))
fastqFolders <- ifelse(lastChr != "/", paste0(fastqFolders, "/"), fastqFolders)
corGeneVec <- as.vector(dfObio$corGeneVec)
if (!(is.null(corGeneVec))){
corGecorGeneVec <- corGeneVec[corGeneVec != ""]
}
projectDetailList <- list(
"RSEMcountDataFile" = as.vector(dfObio$RSEMcountDataFile[1]),
"folder" = as.vector(dfObio$folder[1]),
"primaryAlignmentGeneID" = as.vector(dfObio$primaryAlignmentGeneID[1]),
"paired.end" = as.logical(dfObio$paired.end[1]),
"stranded" = as.logical(dfObio$stranded[1]),
"labname" = as.vector(dfObio$labname[1]),
"projectName" = as.vector(dfObio$project_name[1]),
"read.length" = as.vector(dfObio$read.length[1]),
"modelFN" = as.vector(dfObio$modelFN[1]),
"baseDesignFN" = as.vector(dfObio$baseDesignFN[1]),
"TpmTableFN" = as.vector(dfObio$TpmTableFN[1]),
"PcaFN" = as.vector(dfObio$PcaFN[1]),
"DEseq2_DGE_result_folder" = as.vector(dfObio$DEseq2_DGE_result_folder[1]),
"DEseq2_LRT_result_folder" = as.vector(dfObio$DEseq2_LRT_result_folder[1]),
"DEseq2External_DGE" = as.vector(dfObio$DEseq2External_DGE[1]),
"DEseq2External_LRT" = as.vector(dfObio$DEseq2External_LRT[1]),
"calculate_DGE" = as.logical(dfObio$calculate_DGE[1]),
"calculate_LRT" = as.logical(dfObio$calculate_LRT[1]),
"designFN" = as.vector(dfObio$designFN[1]),
"DGEmodelFN" = as.vector(dfObio$DGEmodelFN[1]),
"DEseq2betaPrior" = as.logical(dfObio$DEseq2betaPrior[1]),
"AlignFASTQcolumn" = as.vector(dfObio$AlignFASTQcolumn[1]),
"NtopGenes" = as.vector(dfObio$NtopGenes[1]),
"designTScol" = as.vector(dfObio$designTScol[1]),
"corGeneVec" = corGeneVec,
"batchMode" = as.logical(dfObio$batchMode[1]),
"parallelProcessing" = as.logical(dfObio$parallelProcessing[1]),
"ModuleFASTQC" = as.vector(dfObio$ModuleFASTQC[1]),
"countTableFN" = as.vector(dfObio$countTableFN[1]),
"ModuleTrimGalore" = as.vector(dfObio$ModuleTrimGalore[1]),
"TrimGaloreMinLength" = as.vector(dfObio$TrimGaloreMinLength[1]),
"TrimGaloreMinQuality" = as.vector(dfObio$TrimGaloreMinQuality[1]),
"lab.categories.table" = as.vector(dfObio$lab.categories.table[1]), # default NULL
"sra.id.vector" = as.vector(dfObio$sra.id.vector[1]),
"gse.id.vector" = as.vector(dfObio$gse.id.vector[1]),
"lims.id" = as.vector(dfObio$lims.id[1]),
"experiment.type" = as.vector(dfObio$experiment.type[1]),
"species" = as.vector(dfObio$species[1]),
"release" = as.vector(dfObio$release[1]),
"project_id" = as.vector(dfObio$project_id[1]),
"labname" = as.vector(dfObio$labname[1]),
"timecourse.units" = as.vector(dfObio$timecourse.units[1]),
"count.table.headline" = as.vector(dfObio$count.table.headline[1]),
"count.table.sidelabel" = as.vector(dfObio$count.table.headline[1]),
"heamap.headline.text" = as.vector(dfObio$count.table.headline[1]),
"pathToSeqStorageFolder" = fastqFolders,
"corGeneVec" = dfObio$corGeneVec[dfObio$corGeneVec != ""]
)
## End project detail list ##
###############################################################################
###############################################################################
## Project Parameters ##
documentationParams <- list(
"title" = as.vector(dfObio$title[1]),
"subtitle" = as.vector(dfObio$subtitle[1]),
"abstract" = as.vector(dfObio$abstract[1])
)
## Done Project Params ##
###############################################################################
###############################################################################
## Reference Table List ##
dfRefTab <- dfObio[,grep("referenceTableListDB", names(dfObio))]
referenceTableList = list()
if (ncol(dfRefTab) > 0){
for (i in 1:ncol(dfRefTab)){
referenceTableList[[as.vector(dfRefTab[1,i])]] <- as.vector(dfRefTab[2,i])
}
## To be added: Check tables against database
}
# mysigdb_sc_sig
# cibersort_L22
# Allen_Brain_Atlas |
# CORUM |
# ChEA_2016 |
# DEPOD_phosphatase_substrates |
# ENCODE_TF_ChIP_seq_2015 |
# GO_Biological_Process_2017 |
#LINCS_L1000_Chem_Pert_down |
#LINCS_L1000_Chem_Pert_down_backup |
# LINCS_L1000_Chem_Pert_up |
# LINCS_L1000_Chem_Pert_up_backup |
# NCBI_homologene_table |
# Old_CMAP_down |
# Old_CMAP_up |
# SGP_from_GEO_up_down_combined |
# SILAC_Phosphoproteomics |
# TRANSFAC_and_JASPAR_PWMs |
# UK_Biobank_GWAS |
# ag_lab_categories |
# as_lab_categories |
# bader_lab_hESC_reference |
# bt_lab_categories |
# cat_selection_default |
# cs_lab_categories |
# da_lab_categories |
# es_lab_categories |
# esl111_cat_reference_db_table |
# et_lab_categories |
# exploration_categories |
# fg_lab_categories |
# fi_lab_categories |
# gk_lab_categories |
# innateDB_PPI |
# jb_lab_categories |
# js_lab_categories |
# kn_lab_categories |
# mysigdb_c1_positional |
# mysigdb_c2_1329_canonical_pathways |
# mysigdb_c2_KEGG |
# mysigdb_c2_REACTOME |
# mysigdb_c2_biocarta |
# mysigdb_c2_chemical_and_genetic_pertubations |
# mysigdb_c3_TF_targets |
# mysigdb_c3_miRNA_targets
# et_lab_categories |
# | exploration_categories |
# | fg_lab_categories |
# | fgl391_cat_reference_db_table |
# | fi_lab_categories |
# | gk_lab_categories |
# | innateDB_PPI |
# | jb_lab_categories |
# | js_lab_categories |
# | kn_lab_categories |
# | mysigdb_c1_positional |
# | mysigdb_c2_1329_canonical_pathways |
# | mysigdb_c2_KEGG |
# | mysigdb_c2_REACTOME |
# | mysigdb_c2_biocarta |
# | mysigdb_c2_chemical_and_genetic_pertubations |
# | mysigdb_c3_TF_targets |
# | mysigdb_c3_miRNA_targets |
# | mysigdb_c5_BP |
# | mysigdb_c5_CC |
# | mysigdb_c5_MF |
# | mysigdb_c6_oncogenic_signatures |
# | mysigdb_c7_immunologic_signatures |
# | mysigdb_h_hallmarks |
# | networkcategories |
# | nl_lab_categories |
# | pa_lab_categories |
# | pb_lab_categories |
# | pfam_interpro |
# | pp_lab_categories |
# | project_db_table |
# | project_db_table_backup |
# | project_description_table |
# | pt_lab_categories |
# | re_lab_categories |
# | reference_categories_db_new |
# | rl_lab_categories |
# | sb_lab_categories |
# | sc_lab_categories |
# | sl_lab_categories |
# | sl_lab_categories_backup |
# | ss_lab_categories |
# | st_lab_categories |
# | temp_categories |
# | vp_lab_categories |
# | vt_lab_categories
## Done ##
###############################################################################
# Species has to be "mus_musculus", "homo_sapiens", "danio_rerio"
# release-86, release-89
## Create defaults ##
Obio = new(
"bioLOGIC",
documentationParams = documentationParams,
dbDetailList = dbDetailList,
projectDetailList = projectDetailList,
referenceTableList = referenceTableList,
parameterList = projectDetailList
)
# In the future this will be done using the more general
# Obio <- biologicSeqTools2::setGenomeAndGeneNameTable(Obio)
# function
## Temporary fix ##
if (Obio@parameterList$species == "homo_sapiens"){
Obio@parameterList$primaryAlignmentGeneID <- "ENSG"
Obio@parameterList$geneIDcolumn <- "hgnc_symbol"
} else if (Obio@parameterList$species == "mus_musculus"){
Obio@parameterList$primaryAlignmentGeneID <- "ENSMUSG"
Obio@parameterList$geneIDcolumn <- "mgi_symbol"
} else if (Obio@parameterList$species == "gallus_gallus"){
Obio@parameterList$primaryAlignmentGeneID <- "ENSGALG"
Obio@parameterList$geneIDcolumn <- "gg_symbol"
} else {
hstring <- unlist(strsplit(Obio@parameterList$species, "_"))
hstring2 <- hstring
hstring2[1] <- substr(hstring2[1], 1,2)
hstring2[2] <- substr(hstring2[2], 1,1)
hstring2 <- toupper(paste0(hstring2, collapse = ""))
primaryAlignmentGeneID <- paste0("ENS", hstring2, "G")
Obio@parameterList$primaryAlignmentGeneID <- primaryAlignmentGeneID
geneID <- paste0(substr(hstring, 1, 1), collapse = "")
geneID <- paste0(geneID, "_symbol")
Obio@parameterList$geneIDcolumn <- geneID
}
## Create analysis folders in the working directory
Obio <- biologicSeqTools2::createAnalysisFolders(
Obio
)
## Set additional parameters
Obio <- biologicSeqTools2::setDataBaseParameters(Obio)
## Create gene annotation if it is not present ##
## To be activated ##
species <- Obio@parameterList$species
species <- unlist(strsplit(species, "_"))
species[1] <- substr(species[1], 1, 1)
selString <- paste0(species, collapse = "")
selString <- paste0(selString, "_gene_ensembl")
if (is.null(Obio@parameterList$release)){
release <- "release-95"
} else {
release <- Obio@parameterList$release
}
## release95 is no longer available
if (release == "release-95"){
release <- "release-98"
}
releaseID <- gsub("release-", "", tolower(release))
## Lookup biomart url
lookupTable <- biomaRt::listEnsemblArchives()
lookupTable <- lookupTable[lookupTable$version == releaseID, ]
if (nrow(lookupTable) > 0){
biomartURL <- as.vector(lookupTable[1,"url"])
} else {
stop("No ensembl biomart table available for this release.")
}
primaryAlignmentGeneID <- Obio@parameterList$primaryAlignmentGeneID
geneIDcolumn <- Obio@parameterList$geneIDcolumn
Obio@parameterList$path2GeneIDtable <- paste0(projectDir, "data/project.gene.annotation.txt")
if (is.null(Obio@parameterList$path2GeneIDtable)){
dfAnno <- biologicSeqTools2::createGeneNameTable(
obj = Obio,
biomart = "ENSEMBL_MART_ENSEMBL",
selString = selString,
host= biomartURL,
primaryAlignmentGeneID = primaryAlignmentGeneID,
geneIDcolumn = geneIDcolumn
)
dfAnno[dfAnno[,geneIDcolumn] == "", geneIDcolumn] <- dfAnno[dfAnno[,geneIDcolumn] == "", primaryAlignmentGeneID]
write.table(dfAnno, paste0(projectDir, "data/project.gene.annotation.txt"), row.names=F, sep="\t")
Obio@parameterList$path2GeneIDtable <- paste0(projectDir, "data/project.gene.annotation.txt")
setTable <- TRUE
} else {
setTable <- FALSE
}
## This can be upgraded to web retrieval of annotation data
Obio <- biologicSeqTools2::addGeneAnnotation(Obio)
## For now we expect a gene annotation file to be specified already
#Obio <- biologicSeqTools2::setCrickGenomeAndGeneNameTable(Obio)
# In the future this will be done using the more general
# biologicSeqTools2::setGenomeAndGeneNameTable
# function
## Create analysis folders in the working directory
# Obio <- biologicSeqTools2::createAnalysisFolders(
# Obio
# )
#
# ## Set additional parameters
# Obio <- biologicSeqTools2::setDataBaseParameters(Obio)
#
# ## This can be upgraded to web retrieval of annotation data
# Obio <- biologicSeqTools2::addGeneAnnotation(Obio)
## Create shiny path for figure outputs ##
shinyURL <- paste0(
"https://",
Obio@parameterList[["shinyBaseServerURL"]],
"/shiny/boeings/",
Obio@parameterList$project_id,
"_app/"
)
## Create outputfolders ##
# if (!dir.exists(paste0(Obio@parameterList$localWorkDir,Obio@parameterList$project_id))){
# dir.create(paste0(Obio@parameterList$localWorkDir,Obio@parameterList$project_id))
# }
Obio@parameterList[["html_local"]] <- paste0(Obio@parameterList$folder, "html_local/")
if (!dir.exists(Obio@parameterList[["html_local"]])){
dir.create(Obio@parameterList[["html_local"]])
}
Obio@parameterList[["reportFigDir"]] <- paste0(Obio@parameterList$html_local, "report_figures/")
if (!dir.exists(Obio@parameterList$reportFigDir)){
dir.create(Obio@parameterList$reportFigDir)
}
pdfTemp <- paste0(Obio@parameterList$reportFigDir, "temp.pdf")
Obio@parameterList[["reportTableDir"]] <- paste0(Obio@parameterList$html_local, "report_tables/")
if (!dir.exists(Obio@parameterList$reportTableDir)){
dir.create(Obio@parameterList$reportTableDir)
}
## Create data dir
Obio@parameterList[["data_dir"]] <- paste0(Obio@parameterList$folder, "data/")
if (!dir.exists(Obio@parameterList$data_dir)){
dir.create(Obio@parameterList$data_dir)
}
## Set default for database connections ##
pos <- grep("^host$", names(Obio@dbDetailList))
if (length(pos) ==0 ){
Obio@dbDetailList$host <- NULL
if (is.null(Obio@dbDetailList)){
Obio@dbDetailList = list("host" = NULL)
}
upload.results.to.database <- FALSE
print("No database server provided. upload.results.to.database set to FALSE")
}
if (!is.null(Obio@dbDetailList$host)){
if (Obio@dbDetailList$host == "10.27.241.234"){
urlString <- "biologic.thecrick.org"
} else {
urlString <- "biologic.crick.ac.uk"
}
} else {
urlString <- ""
}
## Temporary fix functoin
#Obio <- biologicSeqTools2::setCrickGenomeAndGeneNameTable(Obio)
if (setTable){
Obio@parameterList$path2GeneIDtable <- paste0(projectDir, "data/project.gene.annotation.txt")
}
source("save.biologic.robj.R")
## ##
###############################################################################
With the parmeters below, you can import the data relating to this RNA-Seq projet directly into a R-session.
Here we set the database access variables so we can get your data in
username <- "demogtex_da"
pass <- "KGp2S7cC"
host <- "clvd1-db-u-p-31.thecrick.org"
db <- "demogtex_data"
port <- 6008
designTB <- "bulkliverdemo_designTable"
mainTB <- "bulkliverdemo_bulk_rna_seq_table"
countTB <- "bulkliverdemo_count_table"
pcaTB <- "bulkliverdemo_PCA"
species <- "homo_sapiens"
geneIDcolumn <- "hgnc_symbol"
alignmentGeneID <- "ENSG"
Next, let’s load your project data. We will need this as a basis for making the plots further down.
## The line below will install an R-package that we need to connect to the Crick database
devtools::install_github("decusinlabore/bioLOGIC")
library(bioLOGIC)
## Load the design table from database. Here we will retrieve information on samples.
dfDesign <- import.db.table.from.db(
dbname = db,
dbtable = designTB,
host = host,
user = username,
password = pass,
port = port
)
## Load main data table from database. In this table a lot of gene-level information for this project is assembled.
dfMainData <- import.db.table.from.db(
dbname = db,
dbtable = mainTB,
host = host,
user = username,
password = pass,
port = port
)
## Load RSEM count data table.
dfCountData <- import.db.table.from.db(
dbname = db,
dbtable = countTB,
host = host,
user = username,
password = pass,
port = port
)
## Load main pca table from database. This table contains cell-level PCA information.
dfPCA <- import.db.table.from.db(
dbname = db,
dbtable = pcaTB,
host = host,
user = username,
password = pass,
port = port
)
## For some plots we want to limit the number of genes to the most interesting, so let's get those in a vector:
# Most variable gene names
dfVar <- dfMainData[dfMainData$logFC_cut_off != 0 ,c(geneIDcolumn, alignmentGeneID)]
mostVarGenes <- as.vector(unique(sort(dfVar[,geneIDcolumn])))
mostVarGenes <- na.omit(mostVarGenes[mostVarGenes != ""])
# Most variable gene IDs
mostVarIDs <- as.vector(unique(sort(dfVar[,alignmentGeneID])))
mostVarIDs <- na.omit(mostVarIDs[mostVarIDs != ""])
chnkPrefix <- "add.data"
VersionPdfExt <- VersionPdfExt <- paste0(".",chnkPrefix,"V", gsub("-", "", Sys.Date()), ".pdf")
## [1] "RSEM file not (correctly) specified"
For a summary of the alignment parameters, review the QC tab.
The following tables form the basis for this analysis:
Read count table:
/nemo/stp/babs/working/boeings/Projects/boeings/stefan.boeing/621_demo_liverdemo_bulkRNAseq/data/rsem.count.txt
TPM table:
/nemo/stp/babs/working/boeings/Projects/boeings/stefan.boeing/621_demo_liverdemo_bulkRNAseq/data/dfTPM.txt
Quality Control (QC)
You may want to start by reviewing the quality of the underlying RNA and RNA-sequencing. This is done in the Quality control (QC) section. Things to look for are the percentage of reads that aligned to intergenic regions (indicating DNA contamination in your RNA) and the sequence duplication rate (which might be a PCR artefact introduced during sample amplification and may affect the complexity of the sequenced library).
cat(paste0("A summary of this experiment and its results so far is summarized in an editable [report](https://biologic.crick.ac.uk/",Obio@parameterList$project_id,"/report.php). The purpose of this section is to provide a brief summary of the experiment as well as of its results to a person who is interested in your dataset in the distant future. Ideally, this section contains all information necessary to understand the experiment as well as its results so far. You may download this presentation via the link at the bottom of the page, edit it in whatever way you deem suitable and send it back to bioinformatics for updating. You may also want to add a powerpoint presentation outlining the rationale for this experiment as a [slide show](https://biologic.crick.ac.uk/",Obio@parameterList$project_id,"/about.php)."))
A summary of this experiment and its results so far is summarized in an editable report. The purpose of this section is to provide a brief summary of the experiment as well as of its results to a person who is interested in your dataset in the distant future. Ideally, this section contains all information necessary to understand the experiment as well as its results so far. You may download this presentation via the link at the bottom of the page, edit it in whatever way you deem suitable and send it back to bioinformatics for updating. You may also want to add a powerpoint presentation outlining the rationale for this experiment as a slide show.
dfDesign <- Obio@dfDesign
selVec <- c(
names(dfDesign)[grep('sampleID', names(dfDesign))],
names(dfDesign)[grep('original.sample.name', names(dfDesign))],
names(dfDesign)[grep('sample.id', names(dfDesign))],
names(dfDesign)[grep('^sample.group$', names(dfDesign))],
names(dfDesign)[grep('^dataseries$', names(dfDesign))],
names(dfDesign)[grep('^f_', names(dfDesign))],
names(dfDesign)[grep('comp_', names(dfDesign))],
names(dfDesign)[grep('LRT_', names(dfDesign))]
)
selVec <- selVec[selVec %in% names(Obio@dfDesign)]
if (length(selVec) > 1){
dfDesign <- unique(dfDesign[,selVec])
}
colnames <- gsub("_", " ", names(dfDesign))
colnames <- gsub("comp", "DGE", colnames)
colnames <- gsub("^f_", "Factor_", colnames)
colnames <- gsub("[.]", " ", colnames)
colnames <- gsub("original.sample.name", "original", colnames)
DT::datatable(
dfDesign,
colnames = colnames,
rownames = FALSE,
options = list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
"}"
) #,
#order = list(list(3, 'asc'), list(2, 'desc'))
)
)
Table 1: Design table for this experiment outlining original file names, sample names and group definitions for differential gene expression analysis.
figCap2 = paste0(
"**Table ",
Obio@parameterList$tableCount,
":** Design table for this experiment outlining original file names, sample names and group definitions for differential gene expression analysis"
)
dfModel <- Obio@dfModel
# selVec <- c(
# names(dfDesign)[grep('sampleID', names(dfDesign))],
# names(dfDesign)[grep('original.sample.name', names(dfDesign))],
# names(dfDesign)[grep('sample.id', names(dfDesign))],
# names(dfDesign)[grep('^f_', names(dfDesign))]
# names(dfDesign)[grep('comp_', names(dfDesign))],
# names(dfDesign)[grep('LRT_', names(dfDesign))]
#
# )
#
# selVec <- selVec[selVec %in% names(Obio@dfDesign)]
#
# dfDesign <- unique(dfDesign[,selVec])
# colnames <- gsub("_", " ", names(dfDesign))
# colnames <- gsub("comp", "DGE", colnames)
# colnames <- gsub("[.]", " ", colnames)
# colnames <- gsub("original.sample.name", "original", colnames)
if (nrow(dfModel) > 0){
DT::datatable(
dfModel,
#colnames = colnames,
rownames = FALSE,
options = list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
"}"
),
order = list(list(3, 'asc'), list(2, 'desc'))
)
)
}
Table 2: Design table for this experiment outlining original file names, sample names and group definitions for differential gene expression analysis
###############################################################################
## Create and attach DDS object ##
if (is.null( Obio@parameterList$parallelProcessing) || length(Obio@parameterList$parallelProcessing) == 0){
Obio@parameterList$parallelProcessing <- FALSE
}
Obio <- biologicSeqTools2::createDdsObject(Obio)
## Done creating and attaching DDS objecs ##
###############################################################################
###############################################################################
## Determine variation in the data set ##
## This is a tempory fix until the PCA function has been fully equipped for
## batch correction
saveMode <- Obio@parameterList$batchMode
Obio@parameterList$batchMode <- FALSE
Obio <- biologicSeqTools2::addCoVar(
obj = Obio,
avgCountCutOffperSample = 1,
selectionColName = "aboveCutOff",
dfBaseData = Obio@DESeqNormReadCountsTable,
rowNameID = Obio@parameterList$primaryAlignmentGeneID
#options: "DEseq2RV" or "CoVar"
)
## Results are in slot Obio@dataTableList$dfRowVar ##
## Diagnostic plot ##
# library(ggplot2)
# countCutOff <- 0
# pCoVar <- ggplot(
# data=Obio@dataTableList$dfRowVar[Obio@dataTableList$dfRowVar$avgCountsPerGenePerSample > countCutOff ,],
# aes(
# x=CoVar,
# y=DEseq2RV
# )) + geom_point(
# ) + labs(title = "Variation" ,x = "CoVar", y = "DeSeq2Var"
# ) + theme(
# axis.text.y = element_text(size=8),
# axis.text.x = element_text(size=8),
# axis.title.y = element_text(size=8),
# axis.title.x = element_text(size=8),
# axis.line = element_line(colour = "black"),
# panel.border = element_rect(colour = "black", fill=NA, size=1),
# plot.title = element_text(hjust = 0.5, size = 12)
# )
## Done determine variation in the data set ##
###############################################################################
###############################################################################
## Perform PCA, MV-analysis, and Clusterdendrogram ##
## Select elements for PCA explicityly ##
dfSel <- Obio@dataTableList$dfRowVar
## Remove low count values ##
dfSel <- dfSel[dfSel$avgCountsPerGenePerSample >= 1, ]
## Order by variation
dfSel <- dfSel[order(dfSel$DEseq2RV, decreasing = TRUE),]
# dfSel <- dfSel[order(dfSel$CoVar, decreasing = TRUE),]
rowSelVec <- as.vector(
dfSel[1:Obio@parameterList$NtopGenes,Obio@parameterList$primaryAlignmentGeneID]
)
Obio@dataTableList[["Ntop4pcaGeneSelection"]] <- rowSelVec
Obio <- doPCA(
obj = Obio,
pcaDimensionsToInvestigate = c(1:7),
Ntop4pca = Obio@parameterList$NtopGenes, #500,
avgCountCutOffperSample = 0,
pcaSelectionVec = rowSelVec
)
Obio@parameterList$batchMode <- saveMode
## Do linear fittings to available dimensions ##
# depdendent Variation mode: Variables can be dependent (individual fits)
if (length(unique(Obio@dfDesign$dataseries)) > 1){
independentDesignColSector <- c(
"dataseries"
)
} else {
independentDesignColSector <- as.vector(NULL, mode = "character")
}
if (length(Obio@dfDesign$sample.group) > 1){
independentDesignColSector <- c(
independentDesignColSector,
"sample.group"
)
}
if (length(Obio@dfDesign$replicate) > 1){
independentDesignColSector <- c(
independentDesignColSector,
"replicate"
)
}
pos <- grep("^f_", names(Obio@dfDesign))
if (length(pos) > 0){
independentDesignColSector <- c(
names(Obio@dfDesign)[grep("^f_", names(Obio@dfDesign))]
)
} else if (length(unique(Obio@dfDesign$dataseries)) > 1){
independentDesignColSector <- c(
"dataseries"
)
}
# indedendentVariation mode: Variables are required to be independent
## Do linear variable fittings to PCA ##
# variables need to be independent > culmulative fit #
Obio <- biologicSeqTools2::doLinearFittings(
obj = Obio,
designColSelector = independentDesignColSector,
mode = "independentVariation", ## "independentVariation" or "dependentVariation"
Ntop4pca = Obio@parameterList$NtopGenes,
plotname = "P2_independentVariables"
)
## Do linear variable fittings to PCA ##
# individual fit
LRTvec <- names(Obio@dfDesign)[grep("f_", names(Obio@dfDesign))]
#LRTvec <- LRTvec[LRTvec != "LRT_replicate"]
#LRTvec <- "LRT_timepoint"
dependentDesignColSelector<- c(
independentDesignColSector,
LRTvec,
names(Obio@dfDesign)[grep("comp_", names(Obio@dfDesign))],
names(Obio@dfDesign)[grep("LRT_", names(Obio@dfDesign))]
)
Obio <- biologicSeqTools2::doLinearFittings(
obj = Obio,
designColSelector = dependentDesignColSelector,
mode = "dependentVariation", ## "independentVariation" or "dependentVariation"
Ntop4pca = Obio@parameterList$NtopGenes,
plotname = "P2_dependentVariables"
)
## Create Plot dendrogram ##
# Obio <- createSampleDendrogram(
# obj = Obio,
# Ntop4pca = Obio@parameterList$NtopGenes,
# plotname = "dendrogram10000"
# )
###############################################################################
## Do differential gene expression analysis ##
#Obio@parameterList$batchMode <- F
## LRT tests for multiple sample groups ##
# if (!is.null(Obio@projectDetailList$DEseq2_DGE_result_folder) && Obio@projectDetailList$DEseq2_DGE_result_folder != ""){
# mode = "load_DGE_from_file"
# Obio <- loadDESeq2outputFromFile(
# Obio,
# replace = TRUE
# )
# } else {
#}
############################################################
## Function load DGE results from file
loadDESeq2outputFromFile <- function(
Obio,
DEseq2resultDir = "Obio@parameterList$DEseq2External_DGE",
replace = FALSE,
mode = NULL # Can be DGE or LRT
){
if (is.null(mode)){
if (DEseq2resultDir == "DEseq2External_LRT"){
mode = "LRT"
} else {
mode = "DGE"
}
}
allfiles <- paste0(DEseq2resultDir, "/", list.files(DEseq2resultDir))
#allfiles <- allfiles[grep(".txt", allfiles)]
allfiles <- gsub("//", "/", allfiles)
allfiles <- sort(allfiles)
contrastNames <- gsub(gsub("//", "/", paste0(DEseq2resultDir, "/")), "", allfiles)
contrastNames <- gsub(".txt", "", contrastNames)
###############################################################################
## For this project only: Add ENSMUSG column ##
dfAnno <- Obio@dfGeneAnnotation
dfAnno <- unique(
Obio@dfGeneAnnotation[,c(Obio@parameterList$primaryAlignmentGeneID, Obio@parameterList$geneIDcolumn)]
)
## Done adding ENSMUSG column ##
###############################################################################
for (i in 1:length(allfiles)){
colName <- contrastNames[i]
res <- read.delim(allfiles[i], header = T, sep="\t")
## In case gene_ids were saved in row names
pos <- grep("gene_id", names(res))
if (length(pos) == 0){
res[["gene_id"]] <- row.names(res)
} else if (length(pos) == 1){
row.names(res) <- res$gene_id
}
################################
## This project only
selVec <- c("gene_id", "baseMean", "log2FoldChange", "lfcSE", "pvalue", "padj")
if (!sum(selVec %in% names(res)) == length(selVec)){
stop(paste0("Check DESeq2 input files. Mandantory columns: ", paste0(selVec, collapse = ", ")))
}
res <- unique(res[,selVec])
res <- res[!(duplicated(res[,"gene_id"])),]
## Remove all log-fcs with an NA padj
res <- res[!is.na(res$padj),]
res <- res[!is.na(res$log2FoldChange),]
res[is.na(res)] <- 0
res <- res[res$baseMean > 0,]
# Plus one to avoid negative log2 baseMeans
res$baseMean <- log2((res$baseMean + 1))
## Done
################################
#row.names(res) <- res[,Obio@parameterList$primaryAlignmentGeneID]
names(res) = paste(names(res), colName, sep="_")
names(res) <- gsub(paste0("gene_id_", colName), "gene_id", names(res))
names(res) <- gsub("^gene_id$", Obio@parameterList$primaryAlignmentGeneID, names(res))
#res[[Obio@parameterList$primaryAlignmentGeneID]] = rownames(res)
## log2 the base mean for lrt applications
if (mode == "DGE"){
names(res) = gsub("log2FoldChange", "logFC", names(res))
names(res) = gsub(
"logFC",
paste("contrast_D", i, "_logFC", sep=""),
names(res)
)
names(res) = gsub(
"padj",
paste("contrast_D", i, "_padj", sep=""),
names(res)
)
names(res) = gsub(
"stat",
paste("contrast_D", i, "_stat", sep=""),
names(res)
)
names(res) = gsub(
"baseMean",
paste("contrast_D", i, "_lg2BaseMean", sep=""),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
res[,padj.col] <- as.numeric(res[,padj.col])
## Add log10p column ##
padj <- names(res)[grep("_padj_", names(res))]
lg10p <- gsub("padj", "lg10p", padj)
for (z in 1:length(padj)){
preprocess <- as.numeric(res[,padj[z]])
minNum <- min(preprocess[preprocess != 0])
preprocess[preprocess == 0] <- minNum
# if (length(grep("padj_LRT", padj[i])) > 0){
# preprocess <- as.numeric(res[,padj[z]])
# minNum <- min(preprocess[preprocess != 0])
# preprocess[preprocess == 0] <- minNum
# } else {
# preprocess <- as.numeric(res[,padj[z]])
# }
temp <- -1*log10(preprocess)
#temp[temp >= 50] = 50
res[,lg10p[z]] <- temp
}
col.vector = c(
Obio@parameterList$primaryAlignmentGeneID,
names(res)[grep("contrast", names(res))]
)
res = res[,col.vector]
## Make all numeric columns numeric ##
res[,grep("contrast_", names(res))] <- apply(res[,grep("contrast_", names(res))], 2, as.numeric)
}
if (mode == "LRT"){
#res[[Obio@parameterList$primaryAlignmentGeneID]] = rownames(res)
res$stat <- NULL
names(res) = gsub(
"baseMean",
paste0("contrast_L_lg2BaseMean_"),
names(res)
)
names(res) = gsub(
"padj",
paste0("contrast_L_padj_"),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
res[,padj.col] <- as.numeric(res[,padj.col])
## Add log10p column ##
padj <- names(res)[grep("_padj_", names(res))]
lg10p <- gsub("padj", "lg10p", padj)
for (z in 1:length(padj)){
preprocess <- as.numeric(res[,padj[z]])
minNum <- min(preprocess[preprocess != 0])
preprocess[preprocess == 0] <- minNum
# if (length(grep("padj_LRT", padj[i])) > 0){
# preprocess <- as.numeric(res[,padj[z]])
# minNum <- min(preprocess[preprocess != 0])
# preprocess[preprocess == 0] <- minNum
# } else {
# preprocess <- as.numeric(res[,padj[z]])
# }
temp <- -1*log10(preprocess)
#temp[temp >= 50] = 50
res[,lg10p[z]] <- temp
}
col.vector = c(
Obio@parameterList$primaryAlignmentGeneID,
names(res)[grep("contrast", names(res))]
)
res = res[,col.vector]
## Make all numeric columns numierc ##
## Make all numeric columns numierc ##
res[,grep("contrast_", names(res))] <- apply(res[,grep("contrast_", names(res))], 2, as.numeric)
} # end LRT mode
if (i == 1){
dfContrastTable <- res
} else {
dfContrastTable <- merge(
dfContrastTable,
res,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
dfContrastTable[is.na(dfContrastTable)] <- 0
}
}
# head(dfContrastTable)
dfContrastTable<- dfContrastTable[rowSums(dfContrastTable[,2:ncol(dfContrastTable)]) != 0, ]
names(dfContrastTable) <- gsub("__", "_", names(dfContrastTable))
if (mode == "LRT"){
if (replace){
Obio@DEseq2LRTtable <- data.frame(NULL)
Obio@DEseq2LRTtable <- dfContrastTable
} else if (nrow(Obio@DEseq2LRTtable) > 0){
Obio@DEseq2LRTtable <- merge(
Obio@DEseq2LRTtable,
dfContrastTable,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
Obio@DEseq2LRTtable[is.na(Obio@DEseq2LRTtable)] <- 0
} else {
Obio@DEseq2LRTtable <- dfContrastTable
}
}
if (mode == "DGE"){
if (replace){
Obio@DEseq2contrastTable <- data.frame(NULL)
Obio@DEseq2contrastTable <- dfContrastTable
} else if (nrow(Obio@DEseq2contrastTable) > 0){
Obio@DEseq2contrastTable <- merge(
Obio@DEseq2contrastTable,
dfContrastTable,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
Obio@DEseq2contrastTable[is.na(Obio@DEseq2contrastTable)] <- 0
} else {
Obio@DEseq2contrastTable <- dfContrastTable
}
}
if (mode == "LRT"){
print("DESeq2 results loaded into Obio@DEseq2LRTtable")
} else {
print("DESeq2 results loaded into Obio@DEseq2contrastTable")
}
return(Obio)
}
## Done with function ##
###############################################################################
###############################################################################
## Add or calculate LRT results ##
if (!is.null(Obio@parameterList$calculate_DGE) && is.logical(Obio@parameterList$calculate_DGE)) {
calculate_LRT <- Obio@parameterList$calculate_LRT
} else {
calculate_LRT <- TRUE
}
pos <- grep("LRT_", names(Obio@dfDesign))
if (length(pos) == 0){
calculate_LRT <- FALSE
}
if (calculate_LRT){
dfDGE <- Obio@dfModel
dfDGE <- dfDGE[dfDGE$test == "LRT",]
if (nrow(dfDGE) > 0){
Obio <- biologicSeqTools2::LRTanalysis(
obj = Obio,
createNewResultTable = TRUE
)
}
}
## Add additional, external LRT files, if available
lrtFilesToAdd <- NULL
if (!is.null(Obio@parameterList$DEseq2External_LRT)){
lrtFilesToAdd <- list.files(Obio@parameterList$DEseq2External_LRT)
}
if (length(lrtFilesToAdd) > 0){
Obio <- loadDESeq2outputFromFile(
Obio,
replace = TRUE,
mode = "LRT",
DEseq2resultDir = Obio@parameterList$DEseq2External_LRT
)
}
## Done adding and/or calculating LRT results ##
################################################################################
################################################################################
## Add calculate DGE results ##
if (!is.null(Obio@parameterList$calculate_DGE) && is.logical(Obio@parameterList$calculate_DGE)) {
calculate_DGE <- Obio@parameterList$calculate_DGE
} else {
calculate_DGE <- TRUE
}
pos <- grep("comp_", names(Obio@dfDesign))
if (length(pos) == 0){
calculate_DGE <- FALSE
}
if (calculate_DGE){
## Pairwise differential gene expression ##
## If working on projects prior to fall 2018, make sure
## Obio@parameterList$DEseq2betaPrior is set to true.
dfDGE <- Obio@dfModel
dfDGE <- dfDGE[dfDGE$test == "Wald",]
if (nrow(dfDGE) > 0){
Obio <- biologicSeqTools2::DGEanalysis(
obj = Obio,
createNewResultTable = TRUE
)
}
}
## If mode is not calculate DGE, a dge table needs to be provided
dgeFilesToAdd <- NULL
if (!is.null(Obio@parameterList$DEseq2External_DGE)){
dgeFilesToAdd <- list.files(Obio@parameterList$DEseq2External_DGE)
}
if (length(dgeFilesToAdd) > 0){
Obio <- loadDESeq2outputFromFile(
Obio,
replace = FALSE,
mode = "DGE",
DEseq2resultDir = Obio@parameterList$DEseq2External_DGE
)
}
## Done adding and/or calculating LRT results ##
################################################################################
## Done differential gene expresison analysis ##
###############################################################################
###############################################################################
## Create dfSummary ##
dfDGE <- Obio@DEseq2contrastTable
## Temporary Fix ##
## Remove empty pase means
rmVec <- grep("lg2BaseMean$", names(dfDGE))
if (length(rmVec) > 0){
dfDGE <- dfDGE[,-rmVec]
}
## lg2 base mean ##
lg2Vec <- grep("lg2BaseMean", names(dfDGE))
if (length(lg2Vec ) > 0){
for (i in 1:length(lg2Vec ))
dfDGE[,lg2Vec[i]] <- log2(dfDGE[,lg2Vec[i]])
}
dfLRT <- Obio@DEseq2LRTtable
dfPCA <- Obio@dfPCAgenes
df.summary <- dfDGE
###############################################################################
## Calculate correlations ##
## Adding annotation ##
dfAnno <- unique(Obio@dfGeneAnnotation[,c(Obio@parameterList$primaryAlignmentGeneID, Obio@parameterList$geneIDcolumn)])
dfAnno <- dfAnno[dfAnno[,Obio@parameterList$primaryAlignmentGeneID] %in% df.summary[,Obio@parameterList$primaryAlignmentGeneID],]
pos <- grep("corGeneVec", names(Obio@parameterList))
if (length(pos) == 0){
Obio@parameterList[["corGeneVec"]] <- NULL
}
if (!is.null(Obio@parameterList$corGeneVec)){
hVec <- Obio@parameterList$corGeneVec
dfAnnoCor <- dfAnno[dfAnno[,Obio@parameterList$geneIDcolumn] %in% hVec, ]
Obio@parameterList$corGeneVec <- as.vector(dfAnnoCor[,Obio@parameterList$geneIDcolumn])
}
if (exists("dfAnnoCor")){
if (nrow(dfAnnoCor) > 0){
dfTPM <- Obio@dfTPM
names(dfTPM) <- gsub("^X$", Obio@parameterList$primaryAlignmentGeneID, names(dfTPM))
names(dfTPM) <- gsub("^gene_id$", Obio@parameterList$primaryAlignmentGeneID, names(dfTPM))
names(dfTPM) <- paste0("norm_counts_", names(dfTPM ))
names(dfTPM) <- gsub(
paste0(
"norm_counts_", Obio@parameterList$primaryAlignmentGeneID),
Obio@parameterList$primaryAlignmentGeneID,
names(dfTPM)
)
row.names(dfTPM) <- dfTPM[,Obio@parameterList$primaryAlignmentGeneID]
dfTPM[,Obio@parameterList$primaryAlignmentGeneID] <- NULL
for (k in 1:nrow(dfAnnoCor)){
###############################################################################
## do correlation analysis ##
pValueCor = rep(1, nrow(dfTPM))
corCoef = rep(0, nrow(dfTPM))
cor.method = "pearson"
geneSel <- as.vector(dfAnnoCor[k, Obio@parameterList$primaryAlignmentGeneID])
pattern <- as.numeric(dfTPM[geneSel, ])
#Find best correlation with kinase expression
print("Starting to calculate correlations...")
for (i in 1:nrow(dfTPM)){
samplePattern <- as.numeric(t(dfTPM[i,]))
if (sum(samplePattern) != 0){
cor.test.result = cor.test(samplePattern, pattern, method=cor.method)
pValueCor[i] = cor.test.result$p.value
corCoef[i] = cor.test.result$estimate
}
if (i%%1000 == 0){
print(i)
}
}
print("...done.")
dfTPM[["pValueCor"]] <- pValueCor
dfTPM[["corCoef"]] <- corCoef
dfTPM <- dfTPM[order(dfTPM$corCoef, decreasing = TRUE),]
dfTempRes <- dfTPM
dfTempRes[[Obio@parameterList$primaryAlignmentGeneID]] <- row.names(dfTempRes)
dfTempRes <- dfTempRes[,c("corCoef", Obio@parameterList$primaryAlignmentGeneID)]
names(dfTempRes) <- gsub("corCoef", paste0("corCoef_", as.vector(dfAnnoCor[k, Obio@parameterList$geneIDcolumn])), names(dfTempRes))
if (k==1){
dfTRes <- dfTempRes
} else {
dfTRes <- merge(
dfTRes,
dfTempRes,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all =TRUE
)
dfTRes[is.na(dfTRes)] <- 0
}
## Done correlation analysis ##
###############################################################################
}
Obio@dataTableList[["geneCorTables"]] <- dfTRes
}
}
## Done calculate correlations ##
###############################################################################
###############################################################################
## Adding annotation to df.summary ##
dfNormCounts <- Obio@DESeqNormReadCountsTable
names(dfNormCounts) <- paste0("DEseq2NormalizedReadCounts_", names(dfNormCounts))
dfNormCounts[[Obio@parameterList$primaryAlignmentGeneID]] <- row.names(dfNormCounts)
dfTPM <- Obio@dfTPM
## In case other names are used as col names
names(dfTPM) <- gsub("^X$", Obio@parameterList$primaryAlignmentGeneID, names(dfTPM))
names(dfTPM) <- gsub("^gene_id$", Obio@parameterList$primaryAlignmentGeneID, names(dfTPM))
names(dfTPM) <- paste0("norm_counts_", names(dfTPM ))
names(dfTPM) <- gsub(
paste0("norm_counts_", Obio@parameterList$primaryAlignmentGeneID),
Obio@parameterList$primaryAlignmentGeneID,
names(dfTPM)
)
## Done adding annotation ##
###############################################################################
if (nrow(dfLRT) > 0){
df.summary <- merge(
df.summary,
dfLRT,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
}
df.summary <- merge(
df.summary,
dfTPM,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
df.summary <- merge(
df.summary,
dfNormCounts,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
df.summary <- merge(
df.summary,
dfPCA,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
###############################################################################
## Add correlation bits, if they exist ##
## Check gene level correlations ##
if (length(grep("geneCorTables", names(Obio@dataTableList))) > 0){
dfAdd <- Obio@dataTableList$geneCorTables
names(dfAdd)[grep("corCoef_", names(dfAdd))] <- paste0("add_venn_X_", names(dfAdd)[grep("corCoef_", names(dfAdd))])
df.summary <- merge(
df.summary,
dfAdd,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
}
## Check ts correlations.
if (length(grep("tsCorTables", names(Obio@dataTableList))) > 0){
dfAdd <- Obio@dataTableList$tsCorTables
df.summary <- merge(
df.summary,
dfAdd,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
}
## Done with correlations ##
###############################################################################
## Adding annotation ##
dfAnno <- Obio@dfGeneAnnotation
dfAnno <- dfAnno[dfAnno[,Obio@parameterList$primaryAlignmentGeneID] %in% df.summary[,Obio@parameterList$primaryAlignmentGeneID],]
df.summary <- merge(
df.summary,
dfAnno,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
df.summary[df.summary[,Obio@parameterList$geneIDcolumn] == 0, Obio@parameterList$geneIDcolumn] <- df.summary[df.summary[,Obio@parameterList$geneIDcolumn] == 0, Obio@parameterList$primaryAlignmentGeneID]
## Done creating dfSummary ##
###############################################################################
###############################################################################
## Add Variation measures ##
if (!is.null(Obio@dataTableList$dfRowVar)){
dfVarMeasures <- Obio@dataTableList$dfRowVar
df.summary <- merge(
df.summary,
dfVarMeasures,
by.x = Obio@parameterList$primaryAlignmentGeneID,
by.y = Obio@parameterList$primaryAlignmentGeneID,
all = TRUE
)
df.summary[is.na(df.summary)] <- 0
}
## Done with variations ##
###############################################################################
###############################################################################
# Upload to website #
###############################################################################
#library(SBwebtools)
###############################################################################
# Prepare database table #
###############################################################################
###############################################################################
## Default Heatmap option A Ntop most variable genes ##
###############################################################################
## Option A Select most variable genes ##
df.summary[["logFC_cut_off"]] <- 0
rowSelVec <- as.vector(
dfSel[1:Obio@parameterList$NtopGenes,Obio@parameterList$primaryAlignmentGeneID]
)
df.summary[df.summary[, Obio@parameterList$primaryAlignmentGeneID] %in% rowSelVec, "logFC_cut_off"] <- 1
## Select for heatmap ##
# df.summary <- selectHeatmapGenes(
# dfData = df.summary,
# cutOff = 1.3,
# zeroOneCol = "logFC_cut_off",
# selCol = "contrast_1_lg10p",
# geneID = Obio@parameterList$geneIDcolumn
# )
## Select for heatmap all genes with a TPM row sum of 2 or higher ##
# df.summary[["logFC_cut_off"]] <- 0
# df.summary[,"logFC_cut_off"] <- rowSums(df.summary[,grep("norm_counts_", names(df.summary))])
# nSamples <- length(unique(dfDesign$sample.id))
# df.summary[,"logFC_cut_off"] <- ifelse(df.summary$logFC_cut_off >= 5*nSamples, 1, 0)
## Select for heatmap: abs change of at least 0.5 in any contrast ##
###############################################################################
## Create main database table ##
Obio@databaseTable <- biologicSeqTools2::datatable.to.website.ptm(
df.data = df.summary,
gene.id.column = Obio@parameterList$primaryAlignmentGeneID,
heatmap.genes = "",
n.cluster.genes = 2000,
count.data = TRUE,
logFC.cut.off = 1,
#use.logFC.columns.for.heatmap = FALSE,
selector4heatmap.cols = "norm_counts",
heatmap.preprocessing = "lg2.row.avg", # possible: "lg2", "lg2.row.avg", "none"
hm.cut.off = 4,
n.hm.cluster = 10,
count.cut.off.filter = 0
)
## Done creating main database table ##
###############################################################################
###############################################################################
## Create Excel output files ##
addedOutputCols <- c(
names(Obio@databaseTable)[grep("corCoef", names(Obio@databaseTable))],
names(Obio@databaseTable)[grep("chromosome_name", names(Obio@databaseTable))]
)
if (Obio@parameterList$geneIDcolumn != "hgnc_symbol"){
addedOutputCols <- c(
addedOutputCols,
"hgnc_symbol"
)
}
createAndFormatExcelOutputFiles(
obj = Obio,
metaCoreCountFilter = 1,
customOutputCols = NULL,
addedOutputCols = addedOutputCols
)
## Done creating Excel output files ##
###############################################################################
###############################################################################
## Add Covar figure ##
# figureCol is DEseq2RV or CoVar
figureCol <- "DEseq2RV"
dfDat <- unique(
Obio@databaseTable[,c(Obio@parameterList$geneIDcolumn, "DEseq2RV", "CoVar")]
)
dfDat[["Var"]] <- dfDat[,figureCol]
dfDat <- dfDat[order(dfDat$Var, decreasing = TRUE),]
dfDat <- dfDat[dfDat$Var > 0, ]
dfDat[["CoVarOrder"]] <- 1:nrow(dfDat)
# Obio@plotCollection[["CoVar"]] <- ggplot(
# data=dfDat,
# aes(x=CoVarOrder, y=Var)
# ) + geom_line( ) + geom_vline(xintercept = Obio@parameterList$NtopGene, col="red"
# ) + theme(
# axis.text.y = element_text(size=8),
# axis.text.x = element_text(size=8),
# axis.title.y = element_text(size=8),
# axis.title.x = element_text(size=8),
# axis.line = element_line(colour = "black"),
# panel.border = element_rect(colour = "black", fill=NA, size=1),
# plot.title = element_text(hjust = 0.5, size = 12)
# ) + labs(title = paste0("Variation Seen Across all Genes")
# )
## Done adding Covar figure ##
###############################################################################
###############################################################################
## Add additional plot columns from database ##
## Done adding additional plot columns ##
###############################################################################
###############################################################################
# Do GSEA #
###############################################################################
database.table2 <- Obio@databaseTable
rmVec <- c(
#grep("contrast_2", names(database.table2))
grep("contrast_P", names(database.table2)),
grep("contrast_L", names(database.table2)),
grep("LRT_", names(database.table2)),
grep("PCA_", names(database.table2)),
grep("norm_counts_", names(database.table2)),
grep("intercept_", names(database.table2)),
grep("r2_P", names(database.table2)),
grep("DEseq2NormalizedReadCounts", names(database.table2)),
grep("p_value_P", names(database.table2)),
grep("lg2_avg_", names(database.table2))
)
if (length(rmVec) > 0){
database.table2 <- database.table2[,-rmVec]
}
# #
# ## Remove unnecessary columns, if needed ##
# #
# ## Create GSEA rank files ##
# biologicSeqTools2::create.gsea.rnk.files(
# Obio@parameterList$localWorkDir,
# df.dataTable = database.table2,
# GSEA.colum.type = "_logFC_",
# gene.symbol.column.name = "hgnc_symbol"
# )
# #
# # ## Remove last character from file ##
# # #truncate -s -2 file
# # #sed '$d' file # remove last line
# #
# # ## Remove last character from file ##
# # #truncate -s -2 file
# # #sed '$d' file # remove last line
# #
# ## Function to create gmt file ##
if (!is.null(Obio@parameterList$GSEAtables)){
tables <- Obio@parameterList$GSEAtables
} else {
tables <- c(
"mysigdb_h_hallmarks",
"mysigdb_c5_BP" #,
#Obio@parameterList$lab.categories.table
)
}
# #
dfRefGmt <- create.gmt.file.from.ref.data.table(
host = Obio@dbDetailList$host,
dbname = "reference_categories_db_new",
dataTable = tables,
pwd = db.pwd,
user=Obio@dbDetailList$db.user,
gene.id.column = "hgnc_symbol"
)
###############################################################################
## If dfRefGmt is very large, reduce to most variable genes ##
## Define relevant genes for selection ##
relevant.genes <- as.vector(
unique(
Obio@databaseTable[Obio@databaseTable$cluster_order, "hgnc_symbol"]
)
)
lr <- length(relevant.genes)
if (nrow(dfRefGmt) > 10000 & lr > 100){
TF <- apply(dfRefGmt[,3:ncol(dfRefGmt)], 1, function(x) sum(as.vector(x) %in% relevant.genes) )
selector <- 0
selVec <- TF >= selector
while(sum(selVec) > 10000){
selVec <- TF > selector
selector <- selector + 1
if (selector > 199){
stop()
}
}
dfRefGmt <- dfRefGmt[selVec, ]
print(paste0(selector, " gene cut-off set for dfRefGmt. ", sum(selVec), " categories used for GSEA per sample."))
}
## Done ##
###############################################################################
# #
# # ###############################################################################
# # ## Save gmt file ##
# # #"/camp/stp/babs/working/boeings/Projects/reference_data/GSEA.gmt.files/20160508.rna.seq.txn.analysis.gmt"
# #
localGmtDir <- paste0(
Obio@parameterList$localWorkDir,
"GSEA/"
)
if (!exists(localGmtDir)){
dir.create(localGmtDir)
}
#
gmtDir<- paste0(
Obio@parameterList$workdir,
"GSEA/"
)
gmtFileName <- paste0(
Obio@parameterList$project_id,
".",
"projectGmtFile.gmt"
)
dfRefGmt <- dfRefGmt[!(duplicated(dfRefGmt[,1])),]
write.table(
dfRefGmt,
paste0(localGmtDir, gmtFileName),
col.names = FALSE,
row.names = FALSE,
sep="\t",
quote = F
)
# #
contrasts <- names(database.table2)[grep("logFC",names(database.table2))]
contrasts <- contrasts[contrasts != "logFC_cut_off"]
contrasts
GSEAfn <- paste0(
Obio@parameterList$localWorkDir,
"/GSEA/GSEAcommands.sh"
)
sink(GSEAfn)
cat("module load Java/1.8.0_131");cat("\n");cat("\n")
for (i in 1:length(contrasts)){
gmtFile <- paste0(gmtDir, gmtFileName)
contrastNo <- unlist(strsplit(contrasts[i], "_"))[2]
nTopPlots <- 50
GSEAdir <- paste0(Obio@parameterList$workdir, "GSEA")
rnkFile <- paste0(GSEAdir, "/",contrasts[i],".rnk")
gseaCMD <- paste0(
"sbatch --time=03:00:00 --wrap '",
"module load Java/1.8.0_131;",
"java -Xmx2512m -cp /camp/stp/babs/working/boeings/Projects/software/gsea-3.0.jar xtools.gsea.GseaPreranked -gmx ",
gmtFile,
" -rnk ",
rnkFile,
" -rpt_label ",
"contrast_",
contrastNo,
"_rnaSeqTxnTest",
" -out ",
GSEAdir,
" -collapse false -mode Max_probe -norm meandiv -nperm 1000 -scoring_scheme classic -include_only_symbols true -make_sets true -plot_top_x ",
nTopPlots,
" -rnd_seed timestamp -set_max 2500 -set_min 10 -zip_report false -gui false",
"' --job-name='GSEA_",contrastNo,"' --mem=50G -o GSEA_",contrastNo,".slurm >> commands.txt"
)
cat(gseaCMD);cat("\n");cat("\n");
}
sink()
#
chnkVec <- as.vector(NULL, mode = "character")
plotList <- list()
###############################################################################
## Create Clusterdendrogram ##
tag <- paste0("Clusterdendrogram")
colSelVec <- c(
alignmentGeneID,
names(dfMainData)[grep("norm_counts", names(dfMainData))]
)
geneSelVec <- Obio@dataTableList[["Ntop4pcaGeneSelection"]]
geneSelVec <- geneSelVec[geneSelVec != duplicated(geneSelVec)]
dfData <- unique(dfMainData[, colSelVec])
dfData <- dfData[dfData[,alignmentGeneID] %in% geneSelVec, ]
row.names(dfData) <- dfData[,alignmentGeneID]
dfData[,alignmentGeneID] <- NULL
names(dfData) <- gsub("norm_counts_", "", names(dfData))
names(dfData) <- gsub("_TPM", "", names(dfData))
c <- cor(as.matrix(dfData), method="pearson")
d <- as.dist(1-c)
hr <- hclust(d, method = "ward.D", members=NULL)
plotList[[tag]] <- ggdendro::ggdendrogram(
hr,
rotate = TRUE,
size = 4,
theme_dendro = FALSE,
color = "tomato"
) + ggplot2::theme_bw(
) + ggplot2::theme(
axis.text.y = ggplot2::element_text(size=8),
axis.text.x = ggplot2::element_text(size=8),
axis.title.y = ggplot2::element_text(size=8),
axis.title.x = ggplot2::element_text(size=8),
axis.line = ggplot2::element_line(colour = "black"),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
plot.title = ggplot2::element_text(hjust = 0.5, size = 12)
)
FNbase <- paste0(tag, VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(plotList[[tag]])
dev.off()
## ##
###########################################################################
figCap <- paste0(
'**Figure ',
figureCount,
'Sample Dendrogram:** Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>.'
)
figureCount <- figureCount + 1
NewChnk <- paste0(
paste0(
"### Cluster Dendrogram \n"
),
"\n```{r SampleDendrogram, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
## Done creating clusterdendrogram ##
###############################################################################
###########################################################################
## Add Coefficient of variation plot ##
if (length(grep("CoVar", names(dfMainData))) > 0){
tag <- "CoVar_Plot"
figureCol = "DEseq2RV"
dfDat <- unique(
dfMainData[,c( geneIDcolumn, "DEseq2RV", "CoVar")]
)
dfDat[["Var"]] <- dfDat[,figureCol]
dfDat <- dfDat[order(dfDat$Var, decreasing = TRUE),]
dfDat <- dfDat[dfDat$Var > 0, ]
dfDat[["CoVarOrder"]] <- 1:nrow(dfDat)
if (!exists("NtopGene")){
NtopGene <- length(mostVarIDs)
}
plotList[[tag]] <- ggplot2::ggplot(
data=dfDat,
ggplot2::aes(x=CoVarOrder, y=Var)
) + ggplot2::geom_line( ) + ggplot2::geom_vline(xintercept = NtopGene, col="red"
) + ggplot2::theme_bw() + ggplot2::theme(
axis.text.y = ggplot2::element_text(size=8),
axis.text.x = ggplot2::element_text(size=8),
axis.title.y = ggplot2::element_text(size=8),
axis.title.x = ggplot2::element_text(size=8),
axis.line = ggplot2::element_line(colour = "black"),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
plot.title = ggplot2::element_text(hjust = 0.5, size = 12)
) + ggplot2::labs(title = paste0("Variation Seen Across all Genes")
)
###########################################################################
## Save plot to file ##
FNbase <- paste0("CoVar", VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(plotList[[tag]])
dev.off()
## ##
###########################################################################
link <- paste0(
'An interactive version of this figure can be found ',
'<a href="https://', urlString,'/',Obio@projectDetailList$project_id,'/scatterplot?x_axis=CoVarOrder&y_axis=CoVar&headline=2D+Scatterplot" target="_blank">here</a>', '. ')
figLegend <- paste0(
'**Figure ',
figureCount,
':** ',
' Coefficient of variation per gene. The red line indicates the cut-off for the most variable genes in this experiment. The most variable genes are the basis for the PCA analysis and heatmap displays. Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. ',
link
)
figureCount <- figureCount + 1
NewChnk <- paste0(
paste0("### Coefficient of Variation \n"),
"\n```{r CoVarPlot, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figLegend,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
}
## Done adding coefficient of variation ##
###########################################################################
if (length(plotList) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
Figure 1Sample Dendrogram: Download a pdf of this figure here.
Figure 2: Coefficient of variation per gene. The red line indicates the cut-off for the most variable genes in this experiment. The most variable genes are the basis for the PCA analysis and heatmap displays. Download a pdf of this figure here. An interactive version of this figure can be found here.
###############################################################################
## Add PCA plot ##
if (!exists("project_id")){
project_id <- gsub("_designTable", "", designTB)
}
if (!exists("VersionPdfExt")){
VersionPdfExt <- paste0(".V", gsub("-", "", Sys.Date()), ".pdf")
}
if (!exists("labname")){
labname <- "TBD"
}
if (!exists("reportFigDir") || is.null(reportFigDir)){
reportFigDir <- ""
}
chnkVec <- as.vector(NULL, mode = "character")
plotList <- list()
tag <- "PCAvariationPerDimension"
if (exists("Obio")){
pos <- grep("PCApercentVar", slotNames(Obio))
if (!is.null(Obio@PCApercentVar)){
PCApercentVar <- Obio@PCApercentVar
}
} else {
PCApercentVar <- NULL
}
## Use custom PCA colors if specified ##
## Just in case we still have dots instead of underscores
names(dfPCA) <- gsub("\\.", "_", names(dfPCA))
pcaSampleGroups <- unique(sort(dfPCA$sample_group))
## If sample.group colors are set use those, otherwise set default.
pos <- grep("^sample.group_color$", names(dfDesign))
if (length(pos) == 0){
## Create default ##
sample.group <- unique(dfDesign$sample.group)
sample.group_color <- sample.group
#library(scales)
sample.group_color = scales::hue_pal()(length(sample.group_color))
#sample.group_color = c("#990000", "#009900")
## set sample group colors manually
dfGroupColors <- unique(data.frame(sample.group, sample.group_color))
dfDesign <- merge(dfDesign, dfGroupColors, by.x = "sample.group", "sample.group")
}
dfColor <- unique(
Obio@dfDesign[,c("sample.group", "sample.group_color")]
)
if (nrow(dfColor) == length(pcaSampleGroups)){
namedColors <- dfColor$sample.group_color
names(namedColors) <- dfColor$sample.group
plotList[[tag]] <- ggplot2::ggplot(
data = dfPCA,
ggplot2::aes(x=PC1, y=PC2, fill = sample_group)
) + ggplot2::geom_vline(xintercept = 0, color = "grey", size=0.5
) + ggplot2::geom_hline(yintercept = 0, color = "grey", size=0.5
) + ggplot2::geom_point(
size=2,
shape = 21
) + ggplot2::scale_fill_manual("Sample Groups", values = namedColors
)
} else {
plotList[[tag]] <- ggplot2::ggplot(
data = dfPCA,
ggplot2::aes(x=PC1, y=PC2, fill = sample_group)
) + ggplot2::geom_vline(xintercept = 0, color = "grey", size=0.5
) + ggplot2::geom_hline(yintercept = 0, color = "grey", size=0.5
) + ggplot2::geom_point(
size=2,
shape = 21
)
}
if (!is.null(PCApercentVar)){
plotList[[tag]] <- plotList[[tag]] + ggplot2::labs(
title = "PCA Plot",
x = paste0("PC1 \n ",round(100* Obio@PCApercentVar[1]),"% variability explained"),
y = paste0("PC2 \n ",round(100* Obio@PCApercentVar[2]),"% variability explained")
)
} else {
plotList[[tag]] <- plotList[[tag]] + ggplot2::labs(
title = "PCA Plot",
x = paste0("PC1"),
y = paste0("PC2")
)
}
plotList[[tag]] <- plotList[[tag]] + ggplot2::theme_bw() + ggplot2::theme(
axis.text.y = ggplot2::element_text(size=8),
axis.text.x = ggplot2::element_text(size=8),
axis.title.y = ggplot2::element_text(size=12),
axis.title.x = ggplot2::element_text(size=12),
axis.line = ggplot2::element_line(colour = "black"),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
plot.title = ggplot2::element_text(hjust = 0.5, size = 12)
)
###########################################################################
## Save plot to file ##
FNbase <- paste0("PCA12", VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
FNrelT <- paste0("report_tables/", FNbase)
pdf(FN)
print(plotList[[tag]])
dev.off()
## ##
###########################################################################
link <- paste0('<a href="https://biologic.crick.ac.uk/',project_id,'/pca?x_axis=PC1&y_axis=PC2', '" target="_blank">here</a>')
figCap <- paste0(
"**Figure ",
figureCount,
":** Variation in the first two PCA Dimensions. Download a pdf of this figure [here](", FNrel, "). ",
"Further PCA dimensions are available interacively ", link, ". "
)
figureCount <- figureCount + 1
NewChnk <- paste0(
paste0("### PCA_Plot \n"),
"\n```{r ReferencePCA1, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
## Done with PCA plot ##
###############################################################################
###############################################################################
## Add Variation estimate plot ##
if (length(unique(dfDesign$dataseries)) > 1){
independentDesignColSector <- c(
"dataseries"
)
} else {
independentDesignColSector <- as.vector(NULL, mode = "character")
}
if (length(dfDesign$sample.group) > 1){
independentDesignColSector <- c(
independentDesignColSector,
"sample.group"
)
}
if (length(dfDesign$replicate) > 1){
independentDesignColSector <- c(
independentDesignColSector,
"replicate"
)
}
pos <- grep("^f_", names(dfDesign))
if (length(pos) > 0){
independentDesignColSector <- c(
names(dfDesign)[grep("^f_", names(dfDesign))]
)
} else if (length(unique(dfDesign$dataseries)) > 1){
independentDesignColSector <- c(
"dataseries"
)
}
###############################################################################
## Create independent variations plot ##
designColSelector = unique(c(independentDesignColSector, "sample.id"))
if (length(unique(dfDesign$sample.id)) > 42) {
rld <- vst(dds)
} else {
rld <- rlog(dds)
}
rv = rowVars(assay(rld))
## Select most variable genes
select = order(rv, decreasing = TRUE)[seq_len(length(mostVarIDs))]
dfTemp = t(assay(rld)[select, ])
pc <- prcomp(dfTemp, center=TRUE, scale=FALSE)
colDatMin = unique(dfDesign[, designColSelector])
rownames(colDatMin) = as.vector(colDatMin$sample.id)
colDatMin$sample.id <- NULL
#colnames(colData)[1] = "condition"
###############################################################################
## Get PCA Loadings ##
dfBase <- t(dfTemp)
pcaGenes = prcomp(scale(dfBase))
dfPcaGenes = data.frame(pcaGenes$x)
if (ncol(dfPcaGenes) > 10){
dfPcaGenes <- dfPcaGenes[,1:10]
}
dfPcaGenes[[ alignmentGeneID]] <- row.names(dfPcaGenes)
#Obio@dfPCAgenes <- dfPcaGenes
dfPcaGenes <- Obio@dfPCAgenes
## Retrieve pca loadings from previous step
## Add Gene Annotation
dfAnno <- unique(Obio@dfGeneAnnotation[,c( alignmentGeneID, geneIDcolumn)])
dfAnno <- dfAnno[dfAnno[,alignmentGeneID] %in% dfPcaGenes[,alignmentGeneID], ]
dfLoad <- merge(
dfAnno,
dfPcaGenes,
by.x = alignmentGeneID,
by.y = alignmentGeneID,
all = TRUE
)
dfLoad[is.na(dfLoad)] <- 0
dfLoad[dfLoad[,geneIDcolumn] == 0, geneIDcolumn] <- dfLoad[dfLoad[,geneIDcolumn] == 0, alignmentGeneID]
## Make Loadings Plot ##
## Plot ##
selXY <- c("contrast_P_PCA_estimatePCA1", "contrast_P_PCA_estimatePCA2", geneIDcolumn)
dfSel <- unique(dfLoad[,selXY])
#row.names(dfSel) <- dfSel$gene
dfSel[["highlight"]] <- ""
dfSel[["cat"]] <- ""
dfSel[["selX"]] <- ""
dfSel[["selY"]] <- ""
dfSel <- dfSel[order(dfSel[,selXY[1]], decreasing = FALSE), ]
dfSel[1:15, "highlight"] <- "+"
## Use two standard deviations for enrichment ##
twoSD <- 2*sd(dfSel[,selXY[1]])
twoSDxLine <- 2*sd(dfSel[,selXY[1]])
gSvec <- dfSel[dfSel[,selXY[1]] < -1* twoSD, geneIDcolumn]
dfSel <- dfSel[order(dfSel[,selXY[1]], decreasing = TRUE), ]
dfSel[1:15, "highlight"] <- "+"
gSvec <- dfSel[dfSel[,selXY[1]] > twoSD, geneIDcolumn]
## Now dim 2
dfSel <- dfSel[order(dfSel[,selXY[2]], decreasing = FALSE), ]
dfSel[1:15, "highlight"] <- "+"
twoSD <- 2*sd(dfSel[,selXY[2]])
twoSDyLine <- 2*sd(dfSel[,selXY[2]])
gSvec <- dfSel[dfSel[,selXY[2]] < -1* twoSD, geneIDcolumn]
dfSel <- dfSel[order(dfSel[,selXY[2]], decreasing = TRUE), ]
dfSel[1:15, "highlight"] <- "+"
gSvec <- dfSel[dfSel[,selXY[2]] > twoSD, geneIDcolumn]
dfSel[["label"]] <- ""
dfSel[dfSel$highlight == "+", "label"] <- dfSel[dfSel$highlight == "+", geneIDcolumn]
## Done
tag <- "PCA_Loadings"
colVec <- c("grey", "black")
names(colVec) <- c("", "Selected")
plotList[[tag]] <- ggplot2::ggplot(data=dfSel, aes_string(x=selXY[1],y=selXY[2], label="label")
) + geom_vline(xintercept = 0, color = "grey", size=0.5
) + geom_hline(yintercept = 0, color = "grey", size=0.5
) + geom_vline(xintercept = c(twoSDxLine, -1* twoSDxLine), color = "red", lty=2,size=0.5
) + geom_hline(yintercept = c(twoSDyLine, -1* twoSDyLine), color = "red", lty=2,size=0.5
) + geom_hline(yintercept = 0, color = "grey", size=0.5
) + geom_point(col="black") + scale_color_manual(values=colVec
#) + ggtitle(paste0("PCA - Cell Level")
) + theme_bw(
) + theme(
#axis.text.y = element_blank(), # element_text(size=8),
#axis.text.x = element_blank(), #element_text(size=8),
#axis.title.y = element_blank(), #element_text(size=8),
#axis.title.x = element_blank(), #element_text(size=8),
axis.line = element_line(colour = "black"),
legend.position = "none",
panel.border = element_rect(colour = "black", fill=NA, size=1),
#plot.title = element_text(hjust = 0.5, size = 12)
) #+ guides(col = guide_legend(override.aes = list(shape = 16, size = legendDotSize)))
#points <- as.vector(unique(dfSel[dfSel$highlight=="+", geneIDcolumn]))
#plotListGene[[tag]] <- LabelPoints(plot = plotListGene[[tag]], points =points, repel = TRUE, xnudge = 0, ynudge = 0)
plotList[[tag]] <- plotList[[tag]] + ggrepel::geom_text_repel(size = 3)
## Save to file ##
FNbase <- paste0(tag, ".", selXY[1],".", selXY[2], VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
plot(plotList[[tag]])
dev.off()
png 2
# dim1 <- gsub("PC_", "", xVec[i])
# dim2 <- gsub("PC_", "", yVec[i])
link <- paste0(
'<a href="https://',urlString,'/',
project_id,
'/scatterplot?x_axis=contrast_P_PCA_estimatePCA1&y_axis=contrast_P_PCA_estimatePCA2&highlight_gene=&cat_id=ag_lab_categories__10',
'" target="_blank">here</a>'
)
figCap <- paste0(
"**Figure, " ,figureCount,":**Gene-level PCA plot for dimensions ", selXY[1], " and ", selXY[2], ". ",
". An interactive version of this figure can be found ", link, ". "
)
NewChnk <- paste0(
"### PCA_Loadings \n",
"\n```{r PCA_gene_level , results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",
figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
## Done with genes ##
###########################################################################
figureCount <- figureCount + 1
###############################################################################
## Add percent variaton per dimension plot ##
tag <- "Variation_Per_PCA_Dimension"
## Add percent variation plot ##
PercentVariation <- round(100*Obio@PCApercentVar,1)
PCdimension <- paste0("PC", 1:length(PercentVariation))
df <- data.frame(
PercentVariation,
PCdimension
)
legendString <- ""
if (nrow(df) > 20){
legendString <- paste0("Only the first 20 principal components out of ",nrow(df)," are shown in the figure. ")
df <- df[1:20,]
PCdimension <- PCdimension[1:20]
}
df <- df[df$PercentVariation > 0,]
plotList[[tag]] <- ggplot(
df,
aes(PCdimension, PercentVariation)
) + geom_col(
) + scale_x_discrete(limits=PCdimension) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
) + theme_bw()
###########################################################################
## Save plot to file ##
FNbase <- paste0(tag, VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
plotList[[tag]]
dev.off()
png 2
## ##
###########################################################################
figCap <- paste0(
'**Figure ',
figureCount,
':** Percent of variaton explained by each principal component. ',
legendString,
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. '
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"### Amount of variation explained by each PCA Dimension ",
"\n```{r, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
## Done ##
###############################################################################
covar_PC_frame <- rbind(
data.frame(
Component=1:(nrow(pc$x)-1),
spread(
data.frame(
v=names(colDatMin),
val=NA_real_
),
key=v,
value=val
)
)
)
tag <- "independentVariation"
#if (mode == "independentVariation"){
covar_PC_frame <- covar_PC_frame[c("Component", names(colDatMin))]
for (i in 1:nrow(covar_PC_frame)) {
## old code from gavin below ##
fit <- lm(pc$x[,i]~., data=colDatMin)
covar_PC_frame[i,-1] <- drop1(fit, test="F")[names(covar_PC_frame)[-1],"Pr(>F)"]
## replaced 25032019 ##
# Fit each variable individually @
}
plotFrame <- gather(covar_PC_frame, key=Covariate, value=p, -Component)
plotFrame <- plotFrame[order(plotFrame$Component, decreasing = FALSE),]
if (nrow(plotFrame) > 20) {
plotFrame <- plotFrame[1:(length(names(colDatMin)) * 20), ]
}
## Cut to 10 dimensions ##
plotList[[tag]] <- ggplot(
plotFrame,
aes(x=Component, y=Covariate, fill=-log10(p))) +
geom_raster() +
scale_fill_gradient(low="grey90", high="red") +
theme_classic() +
coord_fixed() +
scale_x_continuous( labels = unique(plotFrame$Component), breaks = unique(plotFrame$Component)
)
###########################################################################
## Save plot to file ##
FNbase <- paste0("Independent.variation.per.pca.dimension", VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(plotList[[tag]])
dev.off()
png 2
## ##
###########################################################################
figCap <- paste0(
'**Figure ',
figureCount,
':** Independent sources of Variation per principal component. ',
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. '
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"### Independent Source of Variation Per PCA Component ",
"\n```{r var-per-pca-independent, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
###############################################################################
## Now the plot tolarating dependent variations ##
tag <- "dependentVariation"
dependentDesignColSelector<- c(
independentDesignColSector,
names(Obio@dfDesign)[grep("comp_", names(dfDesign))],
names(Obio@dfDesign)[grep("LRT_", names(dfDesign))]
)
covar_PC_frame <- rbind(
data.frame(
Component=1:(nrow(pc$x)-1),
spread(
data.frame(
v=names(colDatMin),
val=NA_real_
),
key=v,
value=val
)
)
)
mode <- "dependentVariation"
## Do fitting individually ##
## Check that all selVec entries exist
fitVars <- names(covar_PC_frame)
fitVars <- fitVars[fitVars != "Component"]
covar_PC_frame <- covar_PC_frame[c("Component", names(colDatMin))]
for (i in 1:nrow(covar_PC_frame)) {
## old code from gavin below ##
for (j in 1:length(fitVars)){
corVar <- fitVars[j]
if (length(unique(dfDesign[, corVar])) > 1) {
pcDim <- paste0("pc$x[,",i,"]")
regressionFormula <- as.formula(paste(pcDim, corVar, sep="~"))
fit <- lm(regressionFormula, data=colDatMin)
pVal <- as.vector(summary(fit)$coefficients[,4][2])
covar_PC_frame[i, corVar] <- pVal
}
}
}
plotFrame <- gather(covar_PC_frame, key=Covariate, value=p, -Component)
plotFrame <- plotFrame[order(plotFrame$Component, decreasing = FALSE),]
if (nrow(plotFrame) > 20) {
plotFrame <- plotFrame[1:(length(names(colDatMin)) * 20), ]
}
## Cut to 10 dimensions ##
plotList[[tag]] <- ggplot(
plotFrame,
aes(x=Component, y=Covariate, fill=-log10(p))) +
geom_raster() +
scale_fill_gradient(low="grey90", high="red") +
theme_classic() +
coord_fixed() +
scale_x_continuous( labels = unique(plotFrame$Component), breaks = unique(plotFrame$Component)
) + ggplot2::theme(
axis.text.y = ggplot2::element_text(size=8),
axis.text.x = ggplot2::element_text(size=8),
axis.title.y = ggplot2::element_text(size=8),
axis.title.x = ggplot2::element_text(size=8),
axis.line = ggplot2::element_line(colour = "black"),
panel.border = ggplot2::element_rect(colour = "black", fill=NA, size=1),
plot.title = ggplot2::element_text(hjust = 0.5, size = 12)
) + ggplot2::labs(title = "Independent Sources of Variation per PCA Component")
###########################################################################
## Save plot to file ##
FNbase <- paste0("Dependent.permissive.variation.per.pca.dimension", VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
plotList[[tag]]
dev.off()
png 2
## ##
###########################################################################
figCap <- paste0(
'**Figure ',
figureCount,
':** Dependent-tolerant sources of Variation per principal component. ',
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. '
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"### Dependent-tolerant Source of Variation Per PCA Component ",
"\n```{r, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
## Done ##
###############################################################################
## Add PCA loadings
## Add genes driving this PCA dimension ##
# if (!is.null(Obio@plotCollection$PCA1_PCA_fitting)){
#
# pFit <- Obio@plotCollection$PCA1_PCA_fitting
#
#
# ###########################################################################
# ## Save plot to file ##
# FNbase <- paste0("Variation.per.pca.dimension.", VersionPdfExt)
# FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
# FNrel <- paste0("report_figures/", FNbase)
#
# pdf(FN)
# print(pFit)
# dev.off()
# ## ##
# ###########################################################################
# link <- paste0(
# "https://biologic.crick.ac.uk/",
# project_id,
# "/scatterplot?x_axis=contrast_P_PCA_estimatePCA1&y_axis=contrast_P_lg10p_PCA1&highlight_gene=&cat_id=ag_lab_categories__10")
# figCap <- paste0(
# "**Figure ",
# figureCount,
# ":** Genes driving principal components. ",
# "Download a pdf of this figure [here](", FNrel, "). ",
# "Genes driving this - and other PCA dimensions can be accessed interactively [here](", link, "). "
# )
#
# figureCount <- figureCount + 1
#
# NewChnk <- paste0(
# "### Genes Driving PCA Components ",
# "\n```{r var-per-pca-genes, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
# "\n",
# "\n print(pFit)",
# "\n cat( '\n')",
# "\n\n\n```\n"
# )
# chnkVec <- c(
# chnkVec,
# NewChnk
# )
#
# }
# }
## Done adding PCA plots ##
###############################################################################
# if (length(plotList) > 2){
# tabVar <- ".tabset .tabset-fade .tabset-dropdown"
# } else {
# tabVar <- ".tabset .tabset-fade .tabset-pills"
# }
A birds eye view of your data can be obtained by looking at the results of the principal component analysis (PCA). The principal component analysis looks at your count dataset as a whole and determines how ‘close’ two samples are in terms of overall data structure. First of all, you want your replicated to cluster together. After that, you will be able to determine how different various sets of sample groups are.
A more detailed explanation on PCA is give in this PCA video.
Figure, 4:Gene-level PCA plot for dimensions contrast_P_PCA_estimatePCA1 and contrast_P_PCA_estimatePCA2. . An interactive version of this figure can be found here.
Figure 5: Percent of variaton explained by each principal component. Only the first 20 principal components out of 238 are shown in the figure. Download a pdf of this figure here.
Figure 6: Independent sources of Variation per principal component. Download a pdf of this figure here.
Figure 7: Dependent-tolerant sources of Variation per principal component. Download a pdf of this figure here.
## Make heatmap gene list
logFCselections <- names(dfMainData)[grep("_logFC_", names(dfMainData))]
padjSelections <- gsub("_logFC_", "_padj_", logFCselections)
dfSelections <- data.frame(logFCselections, padjSelections)
dfSelections <- dfSelections[dfSelections[,"padjSelections"] %in% names(dfMainData),]
if (nrow(dfSelections) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
## Make heatmap gene list
logFCselections <- names(dfMainData)[grep("_logFC_", names(dfMainData))]
padjSelections <- gsub("_logFC_", "_padj_", logFCselections)
dfSelections <- data.frame(logFCselections, padjSelections)
dfSelections <- dfSelections[dfSelections[,"padjSelections"] %in% names(dfMainData),]
if (!exists("project_id")){
project_id <- gsub("_designTable", "", designTB)
}
if (!exists("VersionPdfExt")){
VersionPdfExt <- paste0(".V", gsub("-", "", Sys.Date()), ".pdf")
}
if (!exists("labname")){
labname <- "TBD"
}
if (!exists("reportFigDir") || is.null(reportFigDir)){
reportFigDir <- ""
}
###############################################################################
## First heatmap: Most variable genes ##
HMplotList <- list()
chnkVec <- as.vector(NULL, mode="character")
if (geneIDcolumn == "mgi_symbol" | geneIDcolumn == "hgnc_symbol"){
geneSelCol <- geneIDcolumn
} else {
geneSelCol <- "hgnc_symbol"
}
# if (is.null(HmDisplayCatsFromDb)){
HmDisplayCatsFromDb <- list()
# }
## Start with Nmost variable genes ##
if (exists("Ntop4pcaGeneSelection") && !is.null(Ntop4pcaGeneSelection) |
length(Ntop4pcaGeneSelection) > 3){
dfDataTable <- dfMainData
geneVec <- as.vector(unique(dfDataTable[dfDataTable[,alignmentGeneID] %in% Ntop4pcaGeneSelection,geneIDcolumn]))
} else {
geneVec <- unique(dfMainData[dfMainData$logFC_cut_off == 1, geneIDcolumn])
Ntop4pcaGeneSelection <- geneVec
}
cat.name <- paste0("Experiment_",project_id, "_",length(Ntop4pcaGeneSelection),"_most_variable_genes")
cat.description.text <- paste0(
"In this gene set the ",
length(geneVec),
" most variable genes from ",
labname,
" lab experiment \\<a href=\\'https:\\/\\/biologic.crick.ac.uk\\/",
project_id,"\\'\\>",project_id, "\\<\\/a\\> are compiled."
)
HmDisplayCatsFromDb[[cat.name]] <- list(
"cat_type" = paste0("temp_", project_id),
"data_source" = paste0(labname, " Lab") ,
"cat.description.text" = cat.description.text,
"geneVec" = geneVec,
"catID" = NULL,
"comparisonID" = NULL
)
###########################################################################
## Make one heatmap per comparison ##
numextract <- function(string){
stringr::str_extract(string, "contrast_\\-*\\d+\\.*\\d*_")
}
dfSelections[["designColumn"]] <- sapply(dfSelections$padjSelections, function(x) unlist(strsplit(x, "padj_"))[2])
## Get design column from model file ##
designColNames <- sapply(dfSelections$padjSelections, function(x) unlist(strsplit(x, "padj_"))[2])
modelComp <- as.vector(dfModel$comparison)
designColNames[!(designColNames %in% modelComp)] <- ""
dfModelSel <- dfModel[dfModel$comparison %in% designColNames,]
dfSelections[["designColumn"]] <- ""
if (nrow(dfModelSel) > 0){
## replace all entries found in dfModel to comparisonID
for (i in 1:nrow(dfModelSel)){
designColNames <- gsub(paste0("^", as.vector(dfModel[i, "comparison"]), "$"), as.vector(dfModel[i, "comparisonID"]),designColNames )
}
dfSelections[["designColumn"]] <- designColNames
}
for (k in 1:nrow(dfSelections)){
dfDataTable <- dfMainData
padjCutOff <- 0.05
geneVec <- as.vector(
unique(
dfDataTable[dfDataTable[,as.vector(dfSelections$padjSelections[k])] < 0.05 & dfDataTable[,as.vector(dfSelections$logFCselections[k])] != 0,geneIDcolumn]
)
)
if (length(geneVec) > 1500){
padjCutOff <- 0.01
geneVec <- as.vector(
unique(
dfDataTable[dfDataTable[,as.vector(dfSelections$padjSelections[k])] < 0.01 &
dfDataTable[,as.vector(dfSelections$logFCselections[k])] != 0,geneIDcolumn
]
)
)
}
## Insert gene set into database ##
cat.name <- paste0(
"Experiment_",project_id, "_",dfSelections$padjSelections[k],"_smaller_than_", gsub("[.]", "_", padjCutOff)
)
cat.description.text <- paste0(
"In this gene set the genes that exhibited an adjusted p value of less than ",
padjCutOff,
" in the differential gene expression comparsion ",
as.vector(dfSelections$logFCselections[k]),
" in ",
labname,
" lab experiment \\<a href=\\'https:\\/\\/biologic.crick.ac.uk\\/",project_id,"\\'\\>",project_id, "\\<\\/a\\> are compiled."
)
comparisonID <- as.vector(dfSelections[k, "designColumn"])
if (comparisonID == ""){
comparisonID <- NULL
}
HmDisplayCatsFromDb[[cat.name]] <- list(
"cat_type" = paste0("temp_", project_id),
"data_source" = paste0(labname, " Lab") ,
"cat.description.text" = cat.description.text,
"geneVec" = geneVec,
"catID" = NULL,
"comparisonID" = comparisonID
)
}
## Done with making heatmap list ##
###########################################################################
[1] “Retrieved category: Transcription_Factors_MC” [1] "Retrieved category ID: ag_lab_categories__10" [1] “Retrieved category: LIANA_Ligand_Reference_2024” [1] "Retrieved category ID: sc_lab_categories__1597" [1] “Retrieved category: LIANA_Receptor_Reference_2024” [1] "Retrieved category ID: sc_lab_categories__1598" [1] “Retrieved category: GO_CYTOKINESIS” [1] "Retrieved category ID: mysigdb_c5_BP__3745" [1] “Retrieved category: GO_PROTEIN_KINASE_ACTIVITY” [1] "Retrieved category ID: mysigdb_c5_MF__169"
## Create Heatmaps ##
###############################################################################
## Reorder Obio@parameterList$HmDisplayCatsFromDb so that 500 var is on top ##
pos <- grep("most_variable_genes", names(HmDisplayCatsFromDb))
if (length(pos) > 0){
pos <- pos[1]
newOrder <- c(
names(HmDisplayCatsFromDb)[pos],
names(HmDisplayCatsFromDb)[-pos]
)
HmDisplayCatsFromDb <- HmDisplayCatsFromDb[newOrder]
}
## ##
###############################################################################
## Begin heatmap plotting loop ##
for (k in 1:length(HmDisplayCatsFromDb)){
## Select samples to display ##
if (!is.null(HmDisplayCatsFromDb[[k]]$comparisonID)){
dfSel <- unique(dfDesign[,c("sample.id", HmDisplayCatsFromDb[[k]]$comparisonID)])
dfSel <- dfSel[dfSel[,HmDisplayCatsFromDb[[k]]$comparisonID] != "",]
if (nrow(dfSel) > 1){
sampleSelection <- paste0("norm_counts_", unique(dfSel$sample.id))
} else {
sampleSelection <- paste0("norm_counts_", unique(dfDesign$sample.id))
}
} else {
sampleSelection <- paste0("norm_counts_", unique(dfDesign$sample.id))
}
## Check ##
sampleSelection <- unique(names(dfMainData)[unlist(sapply(paste0("^", sampleSelection, "$"), function(x) grep(x, names(dfMainData))))])
selVec <- c(geneIDcolumn, sampleSelection )
## Get gene selection
geneSel <- HmDisplayCatsFromDb[[k]]$geneVec
geneSel <- unique(geneSel)
geneSel <- geneSel[geneSel != ""]
if (length(geneSel) > 2){
dfDataTable <- dfMainData
dfDataTable <- unique(dfDataTable[dfDataTable[, geneIDcolumn] %in% geneSel, selVec])
dfHmBase <- unique(dfDataTable[,selVec])
while (sum(duplicated(dfHmBase[, geneIDcolumn])) > 0){
dfHmBase[duplicated(dfHmBase[, geneIDcolumn]), geneIDcolumn] <- paste0(
dfHmBase[duplicated(dfHmBase[, geneIDcolumn]),
geneIDcolumn], "_", i
)
i=i+1
}
row.names(dfHmBase) <- dfHmBase[, geneIDcolumn]
dfHmBase[, geneIDcolumn] <- NULL
## calculate row-means ##
rowMeans <- apply(
dfHmBase,
1,
function(x) mean(x)
)
rowMeans[rowMeans ==0] <- 0.001
hmMax <- 4
for (i in 1:ncol(dfHmBase)){
dfHmBase[,i] <- log2(dfHmBase[,i] / rowMeans)
}
dfHmBase[dfHmBase > hmMax] <- hmMax
dfHmBase[dfHmBase < -1*hmMax] <- -1*hmMax
names(dfHmBase) <- gsub("norm_counts_", "", names(dfHmBase))
names(dfHmBase) <- gsub("_TPM", "", names(dfHmBase))
mHmBase <- data.matrix(dfHmBase)
if ( nrow(mHmBase) < 201){
showRowNames <- TRUE
} else {
showRowNames <- FALSE
}
## Create heatmap plot ##
#library(ComplexHeatmap)
f1 = circlize::colorRamp2(seq(-4, 4, length = 3), c("#3060cf", "#fffbbc","#c4463a"))
anno <- as.data.frame(colnames(mHmBase))
colnames(anno) <- "Sample"
anno$Group <- sapply(as.vector(anno[,1]), function(x) paste0(unlist(strsplit(x, "_"))[1], "_",unlist(strsplit(x, "_"))[2]))
## Color sample groups in line with the designated sample group color ##
#######################################################################
## Add sample group colors if needed
pos <- grep("sample.group_color", names(dfDesign))
if (length(pos) == 0){
sample.group <- unique(dfDesign$sample.group)
sample.group_color <- sample.group
#library(scales)
sample.group_color = scales::hue_pal()(length(sample.group_color))
#sample.group_color = c("#990000", "#009900")
dfGroupColors <- unique(data.frame(sample.group, sample.group_color))
dfDesign <- merge(dfDesign, dfGroupColors, by.x = "sample.group", "sample.group")
if (exists("Obio")){
Obio@dfDesign <- dfDesign
}
}
#library(scales)
#hue_pal()(2)
df <- unique(data.frame(dfDesign[,c("sample.id", "sample.group", "sample.group_color")]))
df <- df[df$sample.id %in% colnames(mHmBase),]
df2 <- data.frame(df[,"sample.group"])
names(df2) <- "Group"
GroupVec <- as.vector(df$sample.group_color)
names(GroupVec) <- as.vector(df$sample.group)
#df2 <- unique(data.frame(Obio@dfDesign[,c("sample.id","sample.group", "sample.group_color")]))
#df2 <- data.frame(df2[,c("sample.group")])
ha = ComplexHeatmap::HeatmapAnnotation(df = df2, col = list(Group = GroupVec))
ComplexHeatmap::ht_opt(
legend_border = "black",
heatmap_border = TRUE,
annotation_border = FALSE
)
hmTitle <- unlist(strsplit(names(HmDisplayCatsFromDb)[k], "_padj_"))
if (length(hmTitle) == 2){
hmTitle <- paste0("padj_", hmTitle[2])
} else {
hmTitle <- names(HmDisplayCatsFromDb)[k]
}
rowFontSize <- 10
if (nrow(mHmBase) > 100){
rowFontSize <- 2
} else if (nrow(mHmBase) > 50){
rowFontSize <- 6
} else if (nrow(mHmBase) > 20){
rowFontSize <- 8
}
HMplotList[[names(HmDisplayCatsFromDb)[k]]] = ComplexHeatmap::Heatmap(
mHmBase,
column_title = gsub(
"_",
" ",
hmTitle
),
name = paste0("HM_", k),
#row_km = 5,
col = f1,
show_column_names = T,
show_row_names = showRowNames,
## row text size
row_names_gp = grid::gpar(fontsize = rowFontSize),
border = TRUE,
#Dendrogram configurations: columns
clustering_distance_columns="euclidean",
clustering_method_columns="complete",
column_dend_height=unit(10,"mm"),
#Dendrogram configurations: rows
clustering_distance_rows="euclidean",
clustering_method_rows="complete",
row_dend_width=unit(10,"mm"),
top_annotation = ha,
show_heatmap_legend = TRUE
#row_title = NULL,
#show_row_dend = FALSE
)
ComplexHeatmap::ht_opt(RESET = TRUE)
if (! is.null(HmDisplayCatsFromDb[[k]]$cat_id)){
link <- paste0(
'An interactive version of this heatmap with an option for further filtering can be found <a href="',
"https://biologic.crick.ac.uk/",
project_id,"/category-view/",
HmDisplayCatsFromDb[[k]]$cat_id,'" target="_blank">here</a>.'
)
} else {
link <- ""
}
###########################################################################
## Save plot to file ##
FNbase <- paste0("Heatmap.", names(HmDisplayCatsFromDb)[k],VersionPdfExt)
FN <- paste0(reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(HMplotList[[names(HmDisplayCatsFromDb)[k]]])
dev.off()
## ##
###########################################################################
figCap <- paste0(
'**Figure ',
figureCount,
':** Heatmap showing the gene category ', gsub('_', ' ', names(HmDisplayCatsFromDb)[k]), '. ',
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. ',
link
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"## HM_", names(HmDisplayCatsFromDb)[k],
"\n```{r, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(HMplotList[['",names(HmDisplayCatsFromDb)[k],"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
} ## End making heatmap
## Done making heatmaps ##
###############################################################################
}
## End heatmap plotting loop
## Done ##
###############################################################################
if (length(HMplotList) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
Figure 15: Heatmap showing the gene category TFs in most variable genes. Download a pdf of this figure here.
Figure 16: Heatmap showing the gene category Ligands in most variable genes. Download a pdf of this figure here.
Figure 17: Heatmap showing the gene category Receptors in most variable genes. Download a pdf of this figure here.
Figure 18: Heatmap showing the gene category GO Kinases. Download a pdf of this figure here.
## Make heatmap gene list
logFCselections <- names(dfMainData)[grep("_logFC_", names(dfMainData))]
padjSelections <- gsub("_logFC_", "_padj_", logFCselections)
dfSelections <- data.frame(logFCselections, padjSelections)
dfSelections <- dfSelections[dfSelections[,"padjSelections"] %in% names(dfMainData),]
if (!exists("project_id")){
project_id <- gsub("_designTable", "", designTB)
}
if (!exists("labname")){
labname <- "TBD"
}
if (!exists("reportFigDir") || is.null(reportFigDir)){
reportFigDir <- ""
}
if (!exists("VersionPdfExt")){
VersionPdfExt <- paste0(".V", gsub("-", "", Sys.Date()), ".pdf")
}
if (nrow(dfSelections) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
if (!exists("sdCutOff")){
sdCutOff <- 2
}
chnkVec <- as.vector(NULL, mode = "character")
MAplotList <- list()
VplotList <- list()
## Create dfMAplots ##
contrastSel <- c(
names(dfMainData)[grep("contrast_[0-9]{1,2}", names(dfMainData))],
names(dfMainData)[grep("contrast_D[0-9]{1,2}", names(dfMainData))]
)
MAselVec <- c(
contrastSel[grep("lg2BaseMean", contrastSel)],
contrastSel[grep("logFC", contrastSel)]
)
VolcanoSelVec <- c(
contrastSel[grep("logFC", contrastSel)],
contrastSel[grep("lg10p", contrastSel)]
)
contrastVec <- as.vector(sapply(
contrastSel[grep("logFC", contrastSel)],
function(x) unlist(strsplit(x, "logFC_"))[2]
))
###############################################################################
## Make MA plot function ##
makeMAplot <- function(
dfPlotData,
geneIDcolumn,
topNgenes = 5,
dotsize = 1,
legendDotSize = 5,
sdCutOff = 1
){
headline <- names(dfPlotData)[grep("logFC", names(dfPlotData))]
headline <- unlist(strsplit(headline, "logFC_"))[2]
names(dfPlotData) <- gsub("contrast_[0-9]{1,2}_", "", names(dfPlotData))
logFCcolName <- names(dfPlotData)[grep("logFC", names(dfPlotData))]
padjColName <- names(dfPlotData)[grep("padj", names(dfPlotData))]
lg2BaseMeanColName <- names(dfPlotData)[grep("lg2BaseMean", names(dfPlotData))]
## Now let's get these data columns out of the main data table.
dfPlotData <- dfPlotData[dfPlotData[,lg2BaseMeanColName] > 0, ]
## For plotting we are using the R-package ggplot. This is a widely used, comprehensive package to make beautiful plots. More information on that here: https://ggplot2.tidyverse.org/
library(ggplot2)
## Let's add an example for custom coloring here. We are going to highlight the most variable genes in this scatterplot. To do that, we need to add a color column to the plot data dataframe.
## Now let's color by significantly up-regulated genes in red, and significantly downregulated genes in blue
dfPlotData[["color"]] <- "NS"
dfPlotData[dfPlotData[, logFCcolName] > 0 & dfPlotData[, padjColName] < 0.05, "color"] <- "Up"
dfPlotData[dfPlotData[, logFCcolName] < 0 & dfPlotData[, padjColName] < 0.05, "color"] <- "Down"
## Re-order dfPlotData for better results
## Let's have a look at the color vector
colorVec <- c("blue", "red","black")
names(colorVec) <- c("Down", "Up", "NS")
## And here is the resulting color vector
colorVec <- colorVec[names(colorVec) %in% dfPlotData$color]
dfPlotData$color <- factor(dfPlotData$color, levels = names(colorVec))
dfPlotData <- dfPlotData[order(dfPlotData$color, decreasing = F), ]
## Now let's also add a label for the 10 most significantly up- and down-regulated genes.This number can be changed in the variable Nsel. Here we use the R package ggrepel.
library(ggrepel)
## Let's order the data frame by log-fold change
dfPlotData <- dfPlotData[order(dfPlotData[,logFCcolName], decreasing = T), ]
topGenes <- as.vector(dfPlotData[1:topNgenes,geneIDcolumn])
dfPlotData <- dfPlotData[order(dfPlotData[,logFCcolName], decreasing = F), ]
bottomGenes <- as.vector(dfPlotData[1:topNgenes,geneIDcolumn])
dfPlotData[["label"]] <- ""
dfPlotData[dfPlotData[,geneIDcolumn] %in% c(topGenes, bottomGenes), "label"] <- dfPlotData[dfPlotData[,geneIDcolumn] %in% c(topGenes, bottomGenes), geneIDcolumn]
yScaleMax <- max(abs(dfPlotData[,logFCcolName]))
lgFCsel <- sdCutOff * sd(dfPlotData[, logFCcolName])
## Now let's first make the MA-plot without lables
plotNoLabels <- ggplot(
data = dfPlotData,
aes_string(x=lg2BaseMeanColName, y=logFCcolName, color = "color", label = "label")
) + geom_hline(yintercept = 0, color = "black", size=0.5
) + geom_hline(yintercept = c(-1*lgFCsel,lgFCsel), color = "grey", size=0.5, linetype = 2
) + geom_point( shape=16, size = dotsize
) + scale_colour_manual(name = "Significant" ,values = colorVec
) + theme_bw(
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
) + ylim(-1*yScaleMax, yScaleMax
) + ggtitle(paste0("MA-Plot ", contrastVec[i])
) + xlab(gsub("_", " ", logFCcolName)
) + ylab(gsub("_", " ", logFCcolName)
) + guides(col = guide_legend(override.aes = list(shape = 16, size = legendDotSize))
)
## And now let's add the labels:
plotWithLabels <- plotNoLabels + geom_text_repel(size = 3)
return(plotWithLabels)
}
## End Make MA plot function ##
###############################################################################
###############################################################################
## Make Volcanoplot ##
makeVolcanoPlot <- function(
dfPlotData,
geneIDcolumn,
topNgenes = 5,
dotsize = 1,
legendDotSize = 5,
sdCutOff = 1
){
headline <- names(dfPlotData)[grep("logFC", names(dfPlotData))]
headline <- unlist(strsplit(headline, "logFC_"))[2]
names(dfPlotData) <- gsub("contrast_[0-9]{1,2}_", "", names(dfPlotData))
logFCcolName <- names(dfPlotData)[grep("logFC", names(dfPlotData))]
lg10pColName <- names(dfPlotData)[grep("lg10p", names(dfPlotData))]
padjColName <- names(dfPlotData)[grep("padj", names(dfPlotData))]
## Now let's get these data columns out of the main data table.
dfPlotData <- dfPlotData[dfPlotData[,logFCcolName] != 0, ]
## Determine logFC cut-off for the Volcano Plot ##
lgFCsel <- sdCutOff * sd(dfPlotData[, logFCcolName])
dfPlotData[["color"]] <- "NS"
dfPlotData[dfPlotData[, logFCcolName] > lgFCsel & dfPlotData[, padjColName] < 0.05, "color"] <- "Up"
dfPlotData[dfPlotData[, logFCcolName] < -1*lgFCsel & dfPlotData[, padjColName] < 0.05, "color"] <- "Down"
## Re-order dfPlotData for better results
## Let's have a look at the color vector
colorVec <- c("blue", "red","black")
names(colorVec) <- c("Down", "Up", "NS")
## And here is the resulting color vector
colorVec <- colorVec[names(colorVec) %in% dfPlotData$color]
dfPlotData$color <- factor(dfPlotData$color, levels = names(colorVec))
dfPlotData <- dfPlotData[order(dfPlotData$color, decreasing = F), ]
## And here is the resulting color vector
colorVec <- colorVec[names(colorVec) %in% dfPlotData$color]
dfPlotData$color <- factor(dfPlotData$color, levels = names(colorVec))
dfPlotData <- dfPlotData[order(dfPlotData$color, decreasing = F), ]
## Now let's also add a label for the 10 most significantly up- and down-regulated genes.This number can be changed in the variable Nsel. Here we use the R package ggrepel.
library(ggrepel)
## Let's order the data frame by log-fold change
dfPlotData <- dfPlotData[order(dfPlotData[,logFCcolName], decreasing = T), ]
topGenes <- as.vector(dfPlotData[1:topNgenes,geneIDcolumn])
dfPlotData <- dfPlotData[order(dfPlotData[,logFCcolName], decreasing = F), ]
bottomGenes <- as.vector(dfPlotData[1:topNgenes,geneIDcolumn])
dfPlotData[["label"]] <- ""
dfPlotData[dfPlotData[,geneIDcolumn] %in% c(topGenes, bottomGenes), "label"] <- dfPlotData[dfPlotData[,geneIDcolumn] %in% c(topGenes, bottomGenes), geneIDcolumn]
xMaxVal <- max(abs(dfPlotData[,logFCcolName]))
pVolcano <- ggplot(
data = dfPlotData,
aes_string(x=logFCcolName, y=lg10pColName, color = "color",label = "label")
) + geom_hline(yintercept = 0, color = "black", size=0.5
) + geom_hline(yintercept = -1*log10(0.05), color = "grey", size=0.5, linetype = 2
) + geom_vline(xintercept = 0, color = "black", size=0.5
) + geom_vline(xintercept = c(-1*lgFCsel,lgFCsel), color = "grey", size=0.5, linetype = 2 ) + geom_point( shape=16, size = dotsize
) + scale_colour_manual(name = "Variability" ,values = colorVec
) + theme_bw(
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
) + xlim(-1*xMaxVal,xMaxVal
) + ggtitle(paste0("Volcano Plot ", contrastVec[i])
) + xlab(gsub("_", " ", logFCcolName)
) + ylab(gsub("_", " ", lg10pColName)
) + guides(col = guide_legend(override.aes = list(shape = 16, size = legendDotSize))
)
plotVolcanoWithLabels <- pVolcano + geom_text_repel(size = 3)
return(plotVolcanoWithLabels)
}
## Done Volcanoplot ##
###############################################################################
for (i in 1:length(contrastVec)){
## Make MA-plot ##
contrastVec <- as.vector(sapply(
contrastSel[grep("logFC", contrastSel)],
function(x) unlist(strsplit(x, "logFC_"))[2]
))
selVec <- c(
geneIDcolumn,
names(dfMainData)[grep(paste0("lg2BaseMean_", contrastVec[i], "$"), names(dfMainData))],
names(dfMainData)[grep(paste0("logFC_", contrastVec[i], "$"), names(dfMainData))],
names(dfMainData)[grep(paste0("padj_", contrastVec[i], "$"), names(dfMainData))],
names(dfMainData)[grep(paste0("lg10p_", contrastVec[i], "$"), names(dfMainData))]
)
dfPlotData <- unique(dfMainData[,selVec])
tagMA <- paste0("MA_", contrastVec[i])
MAplotList[[tagMA]] <- makeMAplot(
dfPlotData = dfPlotData,
geneIDcolumn = geneIDcolumn,
topNgenes = 5,
dotsize = 1,
legendDotSize = 5,
sdCutOff = sdCutOff
)
###########################################################################
## Save plot to file ##
FNbase <- paste0(contrastVec[i], ".MA.plot", VersionPdfExt)
FN <- paste0(reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(MAplotList[[tagMA]])
dev.off()
## ##
###########################################################################
selLg2BM <- selVec[grep("lg2BaseMean_", names(dfPlotData))]
selLogFC <- selVec[grep("_logFC_", names(dfPlotData))]
xAxis <- selLg2BM[grep(contrastVec[i], selLg2BM)]
yAxis <- selLogFC[grep(contrastVec[i], selLogFC)]
link1 <- paste0('<a href="https://biologic.crick.ac.uk/',project_id,'/scatterplot?x_axis=',xAxis,'&y_axis=',yAxis,'&cat_id=ag_lab_categories__10" target="_blank">here</a>.')
figCap <- paste0(
'**Figure ',
figureCount,
'A:** Volcano and MA-plot Plot ',gsub('MA_', '', tagMA),'. ',
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. ',
'An interactive version of this plot can be found ', link1
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"## MA-Plot ",contrastVec[i],
"\n```{r ",contrastVec[i],", results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(MAplotList[['",tagMA,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
## Now the Volcano Plot ##
tagV <- paste0("Volcano_", contrastVec[i])
VplotList[[tagV]] <- makeVolcanoPlot(
dfPlotData,
geneIDcolumn,
topNgenes = 5,
dotsize = 1,
legendDotSize = 5,
sdCutOff = sdCutOff
)
###########################################################################
## Save plot to file ##
FNbase <- paste0(contrastVec[i], ".Volcano.plot", VersionPdfExt)
FN <- paste0(reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(VplotList[[tagV]])
dev.off()
## ##
###########################################################################
selLg10p <- selVec[grep("_lg10p_", names(dfPlotData))]
selLogFC <- selVec[grep("_logFC_", names(dfPlotData))]
xAxis <- selLogFC[grep(contrastVec[i], selLogFC)]
yAxis <- selLg10p[grep(contrastVec[i], selLg10p)]
link2 <- paste0('<a href="https://biologic.crick.ac.uk/',project_id,'/scatterplot?x_axis=',xAxis,'&y_axis=',yAxis,'&cat_id=ag_lab_categories__10" target="_blank">here</a>.')
figCap <- paste0(
'**Figure ',
figureCount,
'B:** Volcanoplot ',contrastVec[i],'. ',
'Download a pdf of this figure <a href="',FNrel,'" target = "_blank">here</a>. ',
'An interactive version of this plot can be found ' , link2, '.'
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"\n```{r V_",contrastVec[i],", results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(VplotList[['",tagV,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
}
if (length(contrastVec) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
Figure 29A: Volcano and MA-plot Plot panTumor_vs_panNormal. Download a pdf of this figure here. An interactive version of this plot can be found here.
Figure 30B: Volcanoplot panTumor_vs_panNormal. Download a pdf of this figure here. An interactive version of this plot can be found here..
if (!exists("sdCutOff")){
sdCutOff <- 2
}
## Create enriched genes list ##
EnrichedGenesList <- list()
contrastSel <- c(
names(dfMainData)[grep("contrast_[0-9]{1,2}", names(dfMainData))],
names(dfMainData)[grep("contrast_D[0-9]{1,2}", names(dfMainData))]
)
DGEtagVec <- as.vector(sapply(
contrastSel[grep("logFC", contrastSel)],
function(x) unlist(strsplit(x, "logFC_"))[2]
))
selVec <- c(
geneIDcolumn,
contrastSel
)
dfAllPlots <- dfMainData[,selVec]
if (geneIDcolumn != "mgi_symbol" & geneIDcolumn != "hgnc_symbol") {
queryGS <- "hgnc_symbol"
} else {
queryGS <- Obio@parameterList$geneIDcolumn
}
for (i in 1:length(DGEtagVec)){
tag <- paste0("Enrichments_HG_", DGEtagVec[i])
tagGLpos <- paste0(DGEtagVec[i], "_pos")
tagGLneg <- paste0(DGEtagVec[i], "_neg")
selVec <- c(
geneIDcolumn,
names(dfAllPlots)[grep(paste0("lg2BaseMean_", DGEtagVec[i],"$"), names(dfAllPlots))],
names(dfAllPlots)[grep(paste0("logFC_",DGEtagVec[i],"$"), names(dfAllPlots))],
names(dfAllPlots)[grep(paste0("padj_",DGEtagVec[i],"$"), names(dfAllPlots))],
names(dfAllPlots)[grep(paste0("lg10p_",DGEtagVec[i],"$"), names(dfAllPlots))]
)
dfPlot <- dfAllPlots[,selVec]
pos <- grep("included", names(dfPlot))
if (length(pos) == 0){
dfPlot[["included"]] <- "+"
}
lgFCsel <- sdCutOff * sd(dfPlot[,grep("_logFC_", names(dfPlot))])
dfPlot[["DGE_Status"]] <- "Unchanged"
dfPlot[dfPlot[,grep("_logFC_", names(dfPlot))] > lgFCsel & dfPlot[,grep("_padj_", names(dfPlot))] < 0.05, "DGE_Status"] <- "Up"
EnrichedGenesList[[tagGLpos]] <- unique(dfPlot[dfPlot$DGE_Status == "Up", geneIDcolumn])
dfPlot[dfPlot[,grep("_logFC_", names(dfPlot))] < -1 *lgFCsel & dfPlot[,grep("_padj_", names(dfPlot))] < 0.05, "DGE_Status"] <- "Down"
EnrichedGenesList[[tagGLneg]] <- unique(dfPlot[dfPlot$DGE_Status == "Down", geneIDcolumn])
print(i)
}
library(knitr)
library(ggplot2)
#save.image("temp.RData")
library(clusterProfiler)
gmtList <- list()
dbtableList <- list(
# "GO-MF" = "mysigdb_c5_MF",
"Pathways" = "mysigdb_c2_1329_canonical_pathways",
"HallMarks" = "mysigdb_h_hallmarks"
)
for (i in 1:length(dbtableList)){
dfTemp <- unique(import.db.table.from.db(
host = Obio@dbDetailList$host,
dbname = Obio@dbDetailList$ref.cat.db,
dbtable = dbtableList[[i]],
password = db.pwd,
user = Obio@dbDetailList$db.user
))
## Remove duplicated entries ##
dfTemp <- dfTemp[!(duplicated(dfTemp$cat_name)),]
rmVec <- grep("temp_", dfTemp$cat_type)
if (length(rmVec) > 0){
dfTemp <- dfTemp[-rmVec, ]
}
dfTemp <- unique(dbcat2gmt(
df.cat = dfTemp, # As downloaded from reference_categories_db_new database
gene.id.column = queryGS
))
dfTemp <- unique(dfTemp[!duplicated(as.vector(dfTemp[,1])), ])
write.table(
dfTemp,
"temp.gmt.txt",
row.names = F,
sep = "\t",
col.names = F,
quote = F
)
CPgmt <- read.gmt("temp.gmt.txt")
unlink("temp.gmt.txt")
CPgmt <- unique(CPgmt[CPgmt$gene != "", ])
gmtList[[dbtableList[[i]]]] <- CPgmt
}
## Edit collection names for plot
names(gmtList) <- gsub("mysigdb_h_hallmarks", "HallMarkCats",names(gmtList))
names(gmtList) <- gsub("mysigdb_", "",names(gmtList))
names(gmtList) <- gsub("c2_1329_canonical_p", "P",names(gmtList))
names(gmtList) <- gsub("sc_sig", "CellSig",names(gmtList))
names(gmtList) <- gsub("cibersort_L22", "CellSig",names(gmtList))
names(gmtList) <- gsub("c5_", "GO_",names(gmtList))
names(gmtList) <- gsub("networkcategories", "Complexes",names(gmtList))
## Done creating gmt list
###########################
## Select colors ##
library(scales)
enrCols <- hue_pal()(length(gmtList))
names(enrCols) <- substr(names(gmtList),1,10)
plotList <- list()
chnkVec <- as.vector(NULL, mode = "character")
for (j in 1:length(DGEtagVec)){
posTestGeneSet <- as.vector(
unique(
EnrichedGenesList[[paste0(DGEtagVec[j], "_pos")]]
)
)
negTestGeneSet <- as.vector(
unique(
EnrichedGenesList[[paste0(DGEtagVec[j], "_neg")]]
)
)
###########################################################################
## Create GMT file for category enrichment ##
###########################
## Create gmt list
## Retrieve gmt files from database
## Add custom gmt files
## Done ##
###########################################################################
library(clusterProfiler)
library(ggplot2)
library(tidyr)
if (geneIDcolumn != "mgi_symbol" & geneIDcolumn != "hgnc_symbol") {
queryGS <- "hgnc_symbol"
} else {
queryGS <- geneIDcolumn
}
if (Obio@dbDetailList$host == "10.27.241.234"){
urlString <- "biologic.thecrick.org"
} else {
urlString <- "biologic.crick.ac.uk"
}
colVec <- c("red", "blue")
pvalueCutoff <- 0.5
topMaxCat <- 10
## Get background gene set ##
#backgroundGeneVec <- row.names(OsC[["RNA"]]@counts)
if ((length(posTestGeneSet) >= 3) | (length(negTestGeneSet) >= 3)){
## Do enrichment ##
first <- TRUE
if (length(posTestGeneSet) >= 3){
for (k in 1:length(gmtList)){
egmt <- data.frame(
enricher(
negTestGeneSet,
TERM2GENE=gmtList[[k]],
pvalueCutoff = pvalueCutoff
)
)
if (!is.null(egmt)){
if (nrow(egmt) > 0){
egmt[["Collection"]] <- substr(names(gmtList)[k], 1,10)
}
if (first){
dfTempEnriched <- egmt
first <- FALSE
} else {
dfTempEnriched <- rbind(
dfTempEnriched,
egmt
)
}
}
}
if (nrow(dfTempEnriched) > 0){
dfTempEnriched[["direction"]] <- "positive"
dfTempEnriched[["log10FDR"]] <- log10(dfTempEnriched$p.adjust)
dfTempEnriched <- dfTempEnriched[order(dfTempEnriched$log10FDR, decreasing = F),]
dfTempEnriched <- na.omit(dfTempEnriched)
if (nrow(dfTempEnriched) > topMaxCat){
dfTempEnriched <- dfTempEnriched[1:topMaxCat, ]
}
}
} # end positive
## Now the negative side ##
if (length(negTestGeneSet) >= 3){
first <- TRUE
for (k in 1:length(gmtList)){
egmt <- data.frame(
enricher(
posTestGeneSet,
TERM2GENE=gmtList[[k]],
pvalueCutoff = pvalueCutoff
)
)
if (!is.null(egmt)){
if (nrow(egmt) > 0){
egmt[["Collection"]] <- substr(names(gmtList)[k], 1,10)
}
if (first){
dfTempEnrichedNeg <- egmt
first <- FALSE
} else {
dfTempEnrichedNeg <- rbind(
dfTempEnrichedNeg,
egmt
)
}
}
}
if (nrow(dfTempEnrichedNeg) > 0){
dfTempEnrichedNeg[["direction"]] <- "negative"
dfTempEnrichedNeg[["log10FDR"]] <- -1*log10(dfTempEnrichedNeg$p.adjust)
dfTempEnrichedNeg <- dfTempEnrichedNeg[order(dfTempEnrichedNeg$log10FDR, decreasing = T),]
dfTempEnrichedNeg <- na.omit(dfTempEnrichedNeg)
if (nrow(dfTempEnrichedNeg) > topMaxCat){
dfTempEnrichedNeg <- dfTempEnrichedNeg[1:topMaxCat, ]
}
}
} # end negative
## Make plot
if (!exists("dfTempEnriched")){
dfTempEnriched <- data.frame(
ID = character(0),
Term = character(0),
p.adjust = numeric(0),
pvalue = numeric(0),
qvalue = numeric(0),
geneID = character(0),
Count = numeric(0),
Collection = character(0),
direction = character(0),
log10FDR = numeric(0)
)
}
if (!exists("dfTempEnrichedNeg")){
dfTempEnrichedNeg <- data.frame(
ID = character(0),
Term = character(0),
p.adjust = numeric(0),
pvalue = numeric(0),
qvalue = numeric(0),
geneID = character(0),
Count = numeric(0),
Collection = character(0),
direction = character(0),
log10FDR = numeric(0)
)
}
if ((nrow(dfTempEnriched) > 0) | (nrow(dfTempEnrichedNeg) > 0)){
dfSel <- rbind(
dfTempEnriched,
dfTempEnrichedNeg
)
dfSel <- na.omit(dfSel)
dfSel <- dfSel[order(dfSel$log10FDR),]
dfSel$log10FDR <- round(dfSel$log10FDR, 2)
dfSel[["Category"]] <- ""
dfSel[dfSel$log10FDR >= 0, "Category"] <- "Enr."
dfSel[dfSel$log10FDR < 0, "Category"] <- "Depl."
for (l in 1:nrow(dfSel)){
if (nchar(dfSel[l, "ID"]) > 30){
part1 <- substr(dfSel[l, "ID"], 1, 30)
part2 <- substr(dfSel[l, "ID"], 31, 60)
dfSel[l, "ID"] <- paste0(part1, " \\n", part2)
}
}
#dfSel$Term <- gsub("\\(GO", "\\\n\\(GO", dfSel$Term)
dfSel$ID <- factor(dfSel$ID, levels = unique(dfSel$ID))
plotList[[paste0("PCA_ENR_", j)]] <- ggplot(
data=dfSel, aes(x= ID, y=log10FDR, fill=Collection, order=log10FDR)
) + geom_bar(stat="identity", colour="black"
) + coord_flip() +scale_fill_manual(values=enrCols
) + theme_bw(
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
) + labs(title = paste0("Comparison ", DGEtagVec[j]," enriched genes") ,y = "-log10(FDR)", x = ""
) + geom_hline(yintercept = c(-log10(0.05), log10(0.05)), color = "grey", size=0.5, lty=2
) + geom_hline(yintercept = 0, color = "black", size=0.5
)
cat(" \n")
## Save to file ##
FNbase <- paste0("DGE_comparison_", DGEtagVec[j],".enriched.genes", VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(plotList[[paste0("PCA_ENR_", j)]])
dev.off()
link <- paste0(
'<a href="https://', urlString, '/',
Obio@parameterList$project_id,
'/category-view?category_type=GO-BP" target="_blank">CategoryView</a>'
)
## Create R markdown chunk ##
figLegend <- paste0(
'**Figure ',
figureCount,
'**: Category enrichment analysis for the top genes that have <font color = "',colVec[2],'"> the most positive </font> and <font color = "',colVec[1],'">the most negative</font> PCA loading values in dimension ',
DGEtagVec[j],
' associated with them. Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. To view these gene sets in the context of your data, go to ',link,' and find these categories using the search box.'
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"## ", DGEtagVec[j],
"\n```{r enrichr_",
j,", results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",
figLegend,"'}\n",
"\n",
"\n print(plotList[['",paste0("PCA_ENR_", j),"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
}
## done with plot
} ## Done with per dimension loops
}
if (length(plotList) > 3){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
###############################################################################
## Do category enrichment on clusters ##
cat(paste(knit(text = chnkVec, quiet = T), collapse = '\n'))
Figure 31: Category enrichment analysis for the top genes that have the most positive and the most negative PCA loading values in dimension Colon_T_vs_Colon_N associated with them. Download a pdf of this figure here. To view these gene sets in the context of your data, go to CategoryView and find these categories using the search box.
Figure 32: Category enrichment analysis for the top genes that have the most positive and the most negative PCA loading values in dimension Kidney_T_vs_Kidney_N associated with them. Download a pdf of this figure here. To view these gene sets in the context of your data, go to CategoryView and find these categories using the search box.
Figure 33: Category enrichment analysis for the top genes that have the most positive and the most negative PCA loading values in dimension Liver_T_vs_Liver_N associated with them. Download a pdf of this figure here. To view these gene sets in the context of your data, go to CategoryView and find these categories using the search box.
Figure 34: Category enrichment analysis for the top genes that have the most positive and the most negative PCA loading values in dimension Lung_T_vs_Lung_N associated with them. Download a pdf of this figure here. To view these gene sets in the context of your data, go to CategoryView and find these categories using the search box.
Figure 35: Category enrichment analysis for the top genes that have the most positive and the most negative PCA loading values in dimension Skin_T_vs_Skin_N associated with them. Download a pdf of this figure here. To view these gene sets in the context of your data, go to CategoryView and find these categories using the search box.
Figure 36: Category enrichment analysis for the top genes that have the most positive and the most negative PCA loading values in dimension panTumor_vs_panNormal associated with them. Download a pdf of this figure here. To view these gene sets in the context of your data, go to CategoryView and find these categories using the search box.
## Done doing enrichment on clusters ##
###############################################################################
if (!exists("project_id")){
project_id <- gsub("_designTable", "", designTB)
}
if (!exists("labname")){
labname <- "TBD"
}
if (!exists("reportFigDir") || is.null(reportFigDir)){
reportFigDir <- ""
}
if (!exists("reportTableDir") || is.null(reportTableDir)){
reportTableDir <- ""
}
if (!exists("VersionPdfExt")){
VersionPdfExt <- paste0(".V", gsub("-", "", Sys.Date()), ".pdf")
}
Find furter background information on the Gene Set Enrichment Analysis (GSEA) and the interpretation of results can be found here. Here the improved fgsea algorithm will be used to calculate enrichment scores.
if (!exists("VersionPdfExt")){
VersionPdfExt <- paste0(".V", gsub("-", "", Sys.Date()), ".pdf")
}
if (!exists("Obio") || is.null(Obio@parameterList$workdir)){
workdir <- getwd()
}
chnkVec <- as.vector(NULL, mode = "character")
plotList <- list()
plotListER <- list()
chnkVecER <- list()
###############################################################################
## Prepare GMT file ##
## Option A - from database ##
if (exists("Obio") && !is.null(Obio@parameterList$GSEAtables)){
tables <- Obio@parameterList$GSEAtables
print(
paste0(
"The following gene sets have been used in the GSEA analysis: ",
sort(paste0(names(Obio@referenceTableList), collapse = ",")),
"."
)
)
} else {
tables <- c(
"mysigdb_h_hallmarks",
"mysigdb_c5_BP" #,
#Obio@parameterList$lab.categories.table
)
}
# #
dfRefGmt <- create.gmt.file.from.ref.data.table(
host = Obio@dbDetailList$host,
dbname = "reference_categories_db_new",
dataTable = tables,
pwd = db.pwd,
user=Obio@dbDetailList$db.user,
gene.id.column = "hgnc_symbol"
)
localGmtDir <- paste0(
Obio@parameterList$workdir,
"GSEA/"
)
if (!exists(localGmtDir)){
dir.create(localGmtDir)
}
#
gmtDir<- paste0(
Obio@parameterList$workdir,
"GSEA/"
)
gmtFileName <- paste0(
project_id,
".",
"projectGmtFile.gmt"
)
dfRefGmt <- dfRefGmt[!(duplicated(dfRefGmt[,1])),]
dfPathwayAnno <- unique(data.frame(cat_id = row.names(dfRefGmt), cat_name = dfRefGmt[,1]), url=dfRefGmt[,2])
dfRefGmt[,2] <- NULL
## transform all columns
empty_as_na <- function(x){
if("factor" %in% class(x)) x <- as.character(x) ## since ifelse wont work with factors
ifelse(as.character(x)!="", x, NA)
}
dfRefGmt <- dfRefGmt %>% dplyr::mutate_each(dplyr::funs(empty_as_na))
write.table(
dfRefGmt,
paste0(localGmtDir, gmtFileName),
col.names = FALSE,
row.names = FALSE,
sep="\t",
quote = F
)
## Done creating project gmt. file ##
###############################################################################
## Option B: Load a gmt file created by other means
# FN <- "/Volumes/babs/working/boeings/Projects/goulda/adrien.franchet/472_brains_from_drosophila_larvae_RN21220/workdir/GSEA/RN21220.projectGmtFile.gmt"
#
# dfRefGmt <- read.delim(
# FN,
# header = F,
# sep = "\t",
# stringsAsFactors = F
# )
###############################################################################
## Run fGSEA on all log-fold changes ##
selVec <- c(
"hgnc_symbol",
names(dfMainData)[grep(paste0("contrast_[0-9]{1,2}_logFC"), names(dfMainData))],
names(dfMainData)[grep(paste0("contrast_D[0-9]{1,2}_logFC"), names(dfMainData))]
)
dfGSEAdata <- unique(dfMainData[, selVec])
dfGSEAdata <- na.omit(dfGSEAdata)
if (ncol(dfGSEAdata) > 2){
dfGSEAdata <- dfGSEAdata[rowSums(dfGSEAdata[,2:ncol(dfGSEAdata)]) != 0,]
} else {
dfGSEAdata <- dfGSEAdata[dfGSEAdata[,2] != 0,]
}
## Delete old rnk files ##
if (!exists("localGmtDir")){
localGmtDir <- "GSEA/"
}
unlink(paste0(localGmtDir, "*.rnk"))
biologicSeqTools2::create.gsea.rnk.files(
workdir,
df.dataTable = dfGSEAdata,
GSEA.colum.type = "logFC",
gene.symbol.column.name = "hgnc_symbol",
GSEADir = localGmtDir
)
rnkFileVec <- paste0(localGmtDir,list.files(localGmtDir)[grep(".rnk$", list.files(localGmtDir))])
plotList <- list()
chnkVec <- as.vector(NULL, mode="character")
## Create Excel output ##
fullOutFN <- paste0(project_id, "_GSEA.xlsx")
outFN <- paste0(project_id, "_GSEA.xlsx")
wb <- openxlsx::createWorkbook()
for (i in 1:length(rnkFileVec)){
logFCcol <- unlist(strsplit(rnkFileVec[i], "GSEA/"))[2]
logFCcol <- gsub(".rnk", "",logFCcol)
lg10pCol <- gsub("logFC","lg10p", logFCcol)
lg2BaseMeanCol <- gsub("logFC", "lg2BaseMean", logFCcol)
tag <- gsub("contrast_[0-9]{1,2}_", "", logFCcol)
tag <- paste0("GSEA_", tag)
dfRnk <- read.delim(
rnkFileVec[i],
header=T,
sep = "\t"
)
GSEAranks <- dfRnk$logFC
names(GSEAranks) <- dfRnk$hgnc_symbol
pathways <- fgsea::gmtPathways(paste0(localGmtDir, gmtFileName))
set.seed(42)
fgseaRes <- fgsea::fgsea(
pathways = pathways,
stats = GSEAranks,
minSize = 10,
maxSize = 2500
)
###########################################################################
## Make top-bottom 10 plot ##
N <- 10
## Top N up NES ##
topNup <- as.vector(unlist(fgseaRes[order(fgseaRes$NES, decreasing = T),"pathway"]))[1:N]
## Top N down NES
topNdown <- as.vector(unlist(fgseaRes[order(fgseaRes$NES, decreasing = F),"pathway"]))[1:N]
topPathways <- c(topNup,rev(topNdown))
## Necessary to load fgsea to get the gridExtra package loaded ##
dfTable <- fgseaRes
dfTable$pathway <- substr(gsub("_", " ", dfTable$pathway),1,60)
library(fgsea)
pdf("temp.pdf")
plotList[[tag]] <- plotGseaTable(
pathways = pathways[topPathways],
stats = GSEAranks,
fgseaRes = dfTable,
gseaParam=0.5,
colwidths = c(5, 3, 0, 0, 0),
render = FALSE
)
dev.off()
unlink("temp.pdf")
###########################################################################
## Save plot to file ##
FNbase <- paste0("GSEAsummary.", tag,VersionPdfExt)
FN <- paste0(reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(grid::grid.draw(plotList[[tag]]))
dev.off()
## ##
###########################################################################
figCap <- paste0(
'**Figure ',
figureCount,
':** Top up- and downregulated GSEA gene categories for the log-FC comparison ', tag, '. ',
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. '
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"### ", tag,
"\n```{r ",tag,", results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(grid::grid.draw(plotList[['",tag,"']]))",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
## Done ##
###########################################################################
###########################################################################
## Make category plots
## stump ##
# library(fgsea)
# plotListER[[tag]] <- plotEnrichment(pathways[[topNup[1]]],
# exampleRanks) + labs(title=gsub("_", " ", topNup[1]))
##
###########################################################################
fgseaRes <- unique(fgseaRes[,c("pathway", "NES", "padj")])
#fgseaRes <- fgseaRes[fgseaRes$padj < 0.05,]
dfTempAnno <- dfPathwayAnno[dfPathwayAnno$cat_name %in% fgseaRes$pathway,]
fgseaRes <- merge(
dfTempAnno,
fgseaRes,
by.x = "cat_name",
by.y = "pathway"
)
fgseaRes[["GSEA"]] <- tag
fgseaRes[["Volcanoplot_Link"]] <- paste0(
'https://biologic.crick.ac.uk/',
Obio@parameterList$project_id,
'/scatterplot?x_axis=',
logFCcol,
'&y_axis=',
lg10pCol,
'&cat_id=',
fgseaRes$cat_id
)
fgseaRes[["MAplot_Link"]] <- paste0(
'https://biologic.crick.ac.uk/',
Obio@parameterList$project_id,
'/scatterplot?x_axis=',
lg2BaseMeanCol,
'&y_axis=',
logFCcol,
'&cat_id=',
fgseaRes$cat_id
)
###############################################################################
## Add scatterplot and heatmap urls ##
fgseaRes[["Heatmap_Link"]] <- paste0('https://biologic.crick.ac.uk/', Obio@parameterList$project_id, '/category-view/', fgseaRes$cat_id)
## Done ##
###############################################################################
###########################################################################
## Save plot to file ##
#library(openxlsx)
FNTbase <- outFN
FNT <- paste0(Obio@parameterList$reportTableDir, FNTbase)
FNTrel <- paste0("report_tables/", FNTbase)
sn <- gsub("GSEA_", "", substr( paste0(tag, "_GSEA"), 1, 27))
sn <- paste0(i, "_", sn)
openxlsx::addWorksheet(wb, sn)
openxlsx::freezePane(wb, sn , firstActiveRow = 2)
hs1 <- openxlsx::createStyle(
fontColour = "#ffffff",
fgFill = "#000000",
halign = "CENTER",
textDecoration = "Bold"
)
openxlsx::writeData(wb, sheet=sn, fgseaRes, startRow = 1, startCol = 1, headerStyle = hs1)
## ##
###########################################################################
if (i==1){
dfRes <- fgseaRes
} else {
dfRes <- rbind(
dfRes,
fgseaRes
)
}
print(paste0(tag, " done."))
}
openxlsx::saveWorkbook(
wb,
FNT,
overwrite = TRUE
)
if (length(plotList) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
###############################################################################
## Make Volcanoplot ##
makeVolcanoPlotGSEA <- function(
dfPlotData,
geneIDcolumn,
topNgenes = 5,
dotsize = 1,
legendDotSize = 5,
sdCutOff = 1
){
headline <- names(dfPlotData)[grep("GSEA", names(dfPlotData))]
#headline <- unlist(strsplit(headline, "logFC_"))[2]
#names(dfPlotData) <- gsub("contrast_[0-9]{1,2}_", "", names(dfPlotData))
logFCcolName <- names(dfPlotData)[grep("NES", names(dfPlotData))]
lg10pColName <- names(dfPlotData)[grep("lg10p", names(dfPlotData))]
padjColName <- names(dfPlotData)[grep("^padj$", names(dfPlotData))]
## Now let's get these data columns out of the main data table.
dfPlotData <- dfPlotData[dfPlotData[,logFCcolName] != 0, ]
dfPlotData[,geneIDcolumn] <- as.character(dfPlotData[,geneIDcolumn] )
## Determine logFC cut-off for the Volcano Plot ##
lgFCsel <- sdCutOff * sd(dfPlotData[, logFCcolName])
dfPlotData[["color"]] <- "NS"
dfPlotData[dfPlotData[, logFCcolName] > lgFCsel & dfPlotData[, padjColName] < 0.05, "color"] <- "Up"
dfPlotData[dfPlotData[, logFCcolName] < -1*lgFCsel & dfPlotData[, padjColName] < 0.05, "color"] <- "Down"
## Re-order dfPlotData for better results
## Let's have a look at the color vector
colorVec <- c("blue", "red","black")
names(colorVec) <- c("Down", "Up", "NS")
## And here is the resulting color vector
colorVec <- colorVec[names(colorVec) %in% dfPlotData$color]
dfPlotData$color <- factor(dfPlotData$color, levels = names(colorVec))
dfPlotData <- dfPlotData[order(dfPlotData$color, decreasing = F), ]
## And here is the resulting color vector
colorVec <- colorVec[names(colorVec) %in% dfPlotData$color]
dfPlotData$color <- factor(dfPlotData$color, levels = names(colorVec))
dfPlotData <- dfPlotData[order(dfPlotData$color, decreasing = F), ]
## Now let's also add a label for the 10 most significantly up- and down-regulated genes.This number can be changed in the variable Nsel. Here we use the R package ggrepel.
library(ggrepel)
## Let's order the data frame by log-fold change
dfPlotData <- dfPlotData[order(dfPlotData[,logFCcolName], decreasing = T), ]
topGenes <- as.vector(dfPlotData[1:topNgenes,geneIDcolumn])
dfPlotData <- dfPlotData[order(dfPlotData[,logFCcolName], decreasing = F), ]
bottomGenes <- as.vector(dfPlotData[1:topNgenes,geneIDcolumn])
dfPlotData[["label"]] <- ""
dfPlotData[dfPlotData[,geneIDcolumn] %in% c(topGenes, bottomGenes), "label"] <- dfPlotData[dfPlotData[,geneIDcolumn] %in% c(topGenes, bottomGenes), geneIDcolumn]
xMaxVal <- max(abs(dfPlotData[,logFCcolName]))
pVolcano <- ggplot(
data = dfPlotData,
aes_string(x=logFCcolName, y=lg10pColName, color = "color",label = "label")
) + geom_hline(yintercept = 0, color = "black", size=0.5
) + geom_hline(yintercept = -1*log10(0.05), color = "grey", size=0.5, linetype = 2
) + geom_vline(xintercept = 0, color = "black", size=0.5
) + geom_vline(xintercept = c(-1*lgFCsel,lgFCsel), color = "grey", size=0.5, linetype = 2 ) + geom_point( shape=16, size = dotsize
) + scale_colour_manual(name = "Variability" ,values = colorVec
) + theme_bw(
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
) + xlim(-1*xMaxVal,xMaxVal
) + ggtitle(paste0("GSEA NES Volcano Plot ", contrastVec[i])
) + xlab(gsub("_", " ", logFCcolName)
) + ylab(gsub("_", " ", lg10pColName)
) + guides(col = guide_legend(override.aes = list(shape = 16, size = legendDotSize))
)
plotVolcanoWithLabels <- pVolcano + geom_text_repel(size = 3)
return(plotVolcanoWithLabels)
}
## Done Volcanoplot ##
###############################################################################
chnkVec <- as.vector(NULL, mode = "character")
plotList <- list()
tagVec <- unique(dfRes$GSEA)
for (i in 1:length(tagVec)){
tag <- paste0("V_",tagVec[i])
dfPlot <- dfRes[dfRes$GSEA == tagVec[i], c("GSEA", "cat_id", "cat_name", "NES", "padj")]
dfPlot <- na.omit(dfPlot)
#dfPlot[["label"]] <- ""
dfPlot <- dfPlot[order(dfPlot$NES, decreasing = T), ]
minP <- min(dfPlot$padj[dfPlot$padj != 0])
dfPlot[["lg10padj"]] <- 0
dfPlot[dfPlot$padj != 0, "lg10padj"] <- -1*log10(dfPlot$padj)
## Function is defined in module C9.
plotList[[tag]] <- makeVolcanoPlotGSEA(
dfPlotData = dfPlot,
geneIDcolumn = "cat_name",
topNgenes = 5,
dotsize = 1,
legendDotSize = 5,
sdCutOff = 1
)
###########################################################################
## Save plot to file ##
FNbase <- paste0(tag, ".volcano.plot", VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(plotList[[tag]])
dev.off()
## ##
###########################################################################
# selLg2BM<- selVec[grep("lg2BaseMean_", names(dfPlotData))]
# selLogFC <- selVec[grep("_logFC_", names(dfPlotData))]
#
#
# xAxis <- selLg2BM[grep(contrastVec[i], selLg2BM)]
# yAxis <- selLogFC[grep(contrastVec[i], selLogFC)]
#
# link1 <- paste0('<a href="https://biologic.crick.ac.uk/',Obio@parameterList$project_id,'/scatterplot?x_axis=',xAxis,'&y_axis=',yAxis,'&cat_id=ag_lab_categories__10" target="_blank">here</a>.')
figCap <- paste0(
'**Figure ',
figureCount,
'A:** GSEA NES Volcano Plot ',tag,'. This may plot might help to determine the overall significance of GSEA category enrichments in this experiment.',
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. '
#'An interactive version of this plot can be found ', link1
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"### GSEAV-Plot ",tag,
"\n```{r ",tag,", results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
}
if (length(plotList) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
Figure 43A: GSEA NES Volcano Plot V_GSEA_logFC_Colon_T_vs_Colon_N. This may plot might help to determine the overall significance of GSEA category enrichments in this experiment.Download a pdf of this figure here.
Figure 44A: GSEA NES Volcano Plot V_GSEA_logFC_Kidney_T_vs_Kidney_N. This may plot might help to determine the overall significance of GSEA category enrichments in this experiment.Download a pdf of this figure here.
Figure 45A: GSEA NES Volcano Plot V_GSEA_logFC_Liver_T_vs_Liver_N. This may plot might help to determine the overall significance of GSEA category enrichments in this experiment.Download a pdf of this figure here.
Figure 46A: GSEA NES Volcano Plot V_GSEA_logFC_Lung_T_vs_Lung_N. This may plot might help to determine the overall significance of GSEA category enrichments in this experiment.Download a pdf of this figure here.
Figure 47A: GSEA NES Volcano Plot V_GSEA_logFC_Skin_T_vs_Skin_N. This may plot might help to determine the overall significance of GSEA category enrichments in this experiment.Download a pdf of this figure here.
Figure 48A: GSEA NES Volcano Plot V_GSEA_logFC_panTumor_vs_panNormal. This may plot might help to determine the overall significance of GSEA category enrichments in this experiment.Download a pdf of this figure here.
###############################################################################
## Retrieve GSEA Table ##
dfGdat <- dfRes
###############################################################################
## Plot top-scoring categories ##
## Select top 10 categories from each ##
dfGdatS <- dfGdat
## Write table as Excel into outputs ##
tableCap <- paste0(
'**GSEA Result Table:** Find the GSEA normalized enrichment score (NES) and the enrichment p-value in the above table. Plot entries mean that for this category and comparison a GSEA plot is readily available for download. Download the full GSEA result table as Excel file <a href = "',FNTrel,'" target="_blank">here</a>'
)
dfGdat <- dfGdatS
chnkVec <- as.vector(NULL, mode = "character")
## Add sample group color ##
dfDataTable <- dfGdat
dfDataTable <- dfDataTable[dfDataTable$padj < 0.25,]
dfDataTable[["Volcanoplot_Link"]] <- paste0(
'<a href="',
dfDataTable[["Volcanoplot_Link"]],
'" target="_blank">Cat Volcano Plot Link</a>'
)
dfDataTable[["MAplot_Link"]] <- paste0(
'<a href="',
dfDataTable[["MAplot_Link"]],
'" target="_blank">Cat_MA Plot Link</a>'
)
dfDataTable[["Heatmap_Link"]] <- paste0(
'<a href="',
dfDataTable[["Heatmap_Link"]],
'" target="_blank">Cat Heatmap Link</a>'
)
dfDataTable$cat_name <- gsub("_", " ", dfDataTable$cat_name)
dfDataTable$NES <- round(dfDataTable$NES, 3)
dfDataTable$padj <- scales::scientific(dfDataTable$padj, digits = 3)
GSEAcol <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, "Pastel1"))(length(unique(dfDataTable$GSEA)))
dfCol <- data.frame(GSEA=unique(dfDataTable$GSEA), contrastCol=GSEAcol)
dfDataTable <- merge(
dfDataTable,
dfCol,
by.x = "GSEA",
by.y = "GSEA"
)
dfDataTable$GSEA <- paste0(
'<p style="background-color:',dfDataTable$contrastCol,';text-align:center">',dfDataTable$GSEA,'</p>'
)
selVec <- c(
"GSEA",
"cat_name",
"NES",
"padj",
"Volcanoplot_Link",
"MAplot_Link",
"Heatmap_Link"
)
selVec <- selVec[selVec %in% names(dfDataTable)]
dfDataTable <- unique(dfDataTable[,selVec])
dfDataTable <- dfDataTable[order(dfDataTable$NES, decreasing=F), ]
DT::datatable(
dfDataTable,
colnames = gsub("_", " ", names(dfDataTable)),
rownames = FALSE,
escape = FALSE,
options = list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
"}"
),
order = list( list(3, 'desc'), list(4, 'asc'))
)
)
GSEA Result Table: Find the GSEA normalized enrichment score (NES) and the enrichment p-value in the above table. Plot entries mean that for this category and comparison a GSEA plot is readily available for download. Download the full GSEA result table as Excel file here
chnkVec <- as.vector(NULL, mode = "character")
plotList <- list()
lrtVec <- names(Obio@DEseq2LRTtable)[grep("LRT_",names(Obio@DEseq2LRTtable))]
lrtVec <- names(Obio@DEseq2LRTtable)
lrtVec <- lrtVec[grep("lg10p", lrtVec)]
if (length(lrtVec) > 0){
for (i in 1:length(lrtVec)){
lrtVec[i] <- unlist(strsplit(lrtVec[i], "lg10p"))[2]
tag <- lrtVec[i]
selVec <- c(
Obio@parameterList$geneIDcolumn,
paste0("contrast_L_lg2BaseMean", lrtVec[i]),
paste0("contrast_L_lg10p", lrtVec[i])
)
dfTemp <- unique(dfMainData[,selVec])
names(dfTemp)[2] <- "X"
names(dfTemp)[3] <- "Y"
dfTemp <- dfTemp[!(dfTemp[,2] ==0),]
dfTemp <- dfTemp[!(dfTemp[,3] ==0),]
dfTemp <- dfTemp[order(dfTemp$Y, decreasing=T),]
dfTemp[["label"]] <- ""
dfTemp[1:10,"label"] <- dfTemp[1:10,geneIDcolumn]
###########################################################################
## Make plot ##
dsize <- 1
alpha <- I(0.5)
shape <- 21
legendDotSize <- 5
plotList[[tag]] <- ggplot(
data = dfTemp,
aes(x=X, y=Y, label = label)
#) + geom_vline(xintercept = 0, color = "grey", size=0.5
) + geom_hline(yintercept = c(2), color = "red", size=0.5,linetype = 2
) + geom_hline(yintercept = 0, color = "grey", size=0.5
) + geom_point(
size=dsize,
shape = shape,
alpha = alpha,
fill = "grey"
) + labs(
title = paste0("LRT padj vs. BaseMean ", lrtVec[i]),
x = paste0("Base Mean/Intensity", lrtVec[i],")"),
y = paste0("-log10(LRT-pval ", lrtVec[i],")")
) + theme_bw(
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=12),
axis.title.x = element_text(size=12),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
#) + scale_fill_manual(values=c("#999999", "#E69F00")
) + guides(col = guide_legend(override.aes = list(shape = 16, size = legendDotSize))
)
## And now let's add the labels:
plotList[[tag]] <- plotList[[tag]] + ggrepel::geom_text_repel(size = 3)
## Done making plot
###########################################################################
###########################################################################
## Save plot to file ##
FNbase <- paste0(lrtVec[i], ".LRTplot", VersionPdfExt)
FN <- paste0(Obio@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(plotList[[tag]])
dev.off()
## ##
###########################################################################
link <- paste0("https://biologic.crick.ac.uk/",Obio@parameterList$project_id,"/scatterplot?x_axis=",selVec[2],"&y_axis=",selVec[3],"&cat_id=ag_lab_categories__10")
figCap <- paste0(
'**Figure ',
figureCount,
'** -log10 p-value of the likelihood ratio rest vs. Base Mean/Intensity plot. Donwolad a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>',
'An interactive version of this figure can be found <a href="',link,'" target="_blank">here</a>. '
)
figureCount <- figureCount + 1
NewChnk <- paste0(
paste0("## ",tag," \n"),
"\n```{r, results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",figCap,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
}
}
if (length(lrtVec) > 0){
if (length(plotList) > 2){
tabVar <- ".tabset .tabset-fade .tabset-dropdown"
} else {
tabVar <- ".tabset .tabset-fade .tabset-pills"
}
sectionDisplay <- paste0("# Likelyhood-ratio Test Results (LRT) {",tabVar, "}")
} else {
sectionDisplay <- ""
}
cat(paste0('## Check Positive Controls - Individual Genes','\n','\n',
'In order to get an overview over your latest sequencing results, you might want to look for the performance of individual genes that may serve as a positive control. You can do this by entering an official gene name into the search box in the [GeneView section](https://biologic.crick.ac.uk/',Obio@parameterList$project_id,'/gene-view). Genes that were detected in this experiment will be suggested to you after starting to type. If a gene is not suggested, it was not detected in this experiment. The gene result on display will give you information on the amount of reads detected for the gene in question (TPM value plot. TPM values give you read-counts, normalized for the gene length and the library size - see slideshow for a detailed definition).','\n','\n',
'## Check Positive Controls - Gene Categories
Next you may wish to view your latest dataset through the lens of a gene category that captures all genes relevant to the process you are investigating. A number of gene categories represented in your experiment can be found in the CategoryView section, lower panel. Reference categories are organized by category class and for most reference category a weblink is given to inform you about the origin of that gene category dataset (Category Description column). Click to the category name in order to view the performance of the genes in that category in the context of your experiment. You will be presented by default with a heatmap, but you may change this to a 2D scatterplot in which the category genes are highlighted by using the pull-down menu given underneath the heatmap depiction. In addition, a table is given informing you about the log-fold changes recorded for genes in this category. You may click on individual tiles in the heatmap to be taken to the individual results for the gene.','\n','\n',
'It might make sense to review your data in the context of results your lab has obtained in the past or in the context of published data. In order to that bioinformatics will add gene categories to either your lab categories selection or to the selection "this experiment" in CategoryView, lower table.','\n','\n',
'## Result Table Download',
'\n','\n',
'<a href="report_tables/"',Obio@parameterList$project_id,'_GSEA.xlsx" target="_blank">Download Result Table</a>','\n','\n',
'<a href="report_tables/"',Obio@parameterList$project_id,'.result.table.xlsx" target="_blank">Download Metacore Input File</a>','\n','\n',
'## Bioinformatics Method Summary ','\n',
'Sequencing was performed on an ',Obio@parameterList$machine,' machine. The "Trim Galore!" utility version 0.4.2 was used to remove sequencing adaptors and to quality trim individual reads with the q-parameter set to 20 (1). Then sequencing reads were aligned to the mouse genome and transcriptome (Ensembl ', Obio@parameterList$genome, Obio@parameterList$release,') using RSEM version 1.3.0 (2) in conjunction with the STAR aligner version 2.5.2 (3). Sequencing quality of individual samples was assessed using FASTQC version 0.11.5 (4) and RNA-SeQC version 1.1.8 (5). Differential gene expression was determined using the R-bioconductor package DESeq2 version 1.14.1(6,7). Gene set enrichment analysis (GSEA) was conducted as described in Subramanian et al (8).','\n','\n',
'REF 1: https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ (retrieved 03-05-2017)','\n','\n',
'REF 2: Bo Li and Colin N Dewey (2011). RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics 12:323','\n','\n',
'REF 3 : Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M and Gineras TR. (2012) STAR: ultrafast universal RNASEQ aligner. Bioinformatics. 29. 15-21','\n','\n',
'REF 4: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ (retrieved 03-05-2017)','\n','\n',
'REF 5: DeLuca et al (2012). RNA-SeQC: RNA-seq metrics for quality control and process optimization. Bioinformatics (28) 1530-1532','\n','\n',
'REF 6: Love MI, Huber W and Anders S (2014). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology, 15, pp. 550.','\n','\n',
'REF 7: R Development Core Team (2008). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.','\n','\n',
'REF 8: Subramanian et al.(2005), Gene set enrichment analysi: A knowledge-based approach for interpreting genome-wide expression profiles. PNAS (43) 15545-15550.','\n','\n'
))
In order to get an overview over your latest sequencing results, you might want to look for the performance of individual genes that may serve as a positive control. You can do this by entering an official gene name into the search box in the GeneView section. Genes that were detected in this experiment will be suggested to you after starting to type. If a gene is not suggested, it was not detected in this experiment. The gene result on display will give you information on the amount of reads detected for the gene in question (TPM value plot. TPM values give you read-counts, normalized for the gene length and the library size - see slideshow for a detailed definition).
Next you may wish to view your latest dataset through the lens of a gene category that captures all genes relevant to the process you are investigating. A number of gene categories represented in your experiment can be found in the CategoryView section, lower panel. Reference categories are organized by category class and for most reference category a weblink is given to inform you about the origin of that gene category dataset (Category Description column). Click to the category name in order to view the performance of the genes in that category in the context of your experiment. You will be presented by default with a heatmap, but you may change this to a 2D scatterplot in which the category genes are highlighted by using the pull-down menu given underneath the heatmap depiction. In addition, a table is given informing you about the log-fold changes recorded for genes in this category. You may click on individual tiles in the heatmap to be taken to the individual results for the gene.
It might make sense to review your data in the context of results your lab has obtained in the past or in the context of published data. In order to that bioinformatics will add gene categories to either your lab categories selection or to the selection “this experiment” in CategoryView, lower table.
<a href=“report_tables/”bulkliverdemo_GSEA.xlsx" target="_blank">Download Result Table
<a href=“report_tables/”bulkliverdemo.result.table.xlsx" target="_blank">Download Metacore Input File
Sequencing was performed on an machine. The “Trim Galore!” utility version 0.4.2 was used to remove sequencing adaptors and to quality trim individual reads with the q-parameter set to 20 (1). Then sequencing reads were aligned to the mouse genome and transcriptome (Ensembl release-105) using RSEM version 1.3.0 (2) in conjunction with the STAR aligner version 2.5.2 (3). Sequencing quality of individual samples was assessed using FASTQC version 0.11.5 (4) and RNA-SeQC version 1.1.8 (5). Differential gene expression was determined using the R-bioconductor package DESeq2 version 1.14.1(6,7). Gene set enrichment analysis (GSEA) was conducted as described in Subramanian et al (8).
REF 1: https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ (retrieved 03-05-2017)
REF 2: Bo Li and Colin N Dewey (2011). RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome. BMC Bioinformatics 12:323
REF 3 : Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M and Gineras TR. (2012) STAR: ultrafast universal RNASEQ aligner. Bioinformatics. 29. 15-21
REF 4: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ (retrieved 03-05-2017)
REF 5: DeLuca et al (2012). RNA-SeQC: RNA-seq metrics for quality control and process optimization. Bioinformatics (28) 1530-1532
REF 6: Love MI, Huber W and Anders S (2014). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology, 15, pp. 550.
REF 7: R Development Core Team (2008). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.
REF 8: Subramanian et al.(2005), Gene set enrichment analysi: A knowledge-based approach for interpreting genome-wide expression profiles. PNAS (43) 15545-15550.
## [1] "Projectfolder: /nemo/stp/babs/working/boeings/Projects/boeings/stefan.boeing/621_demo_liverdemo_bulkRNAseq/scripts/bulkliverdemo/analyses/Main_Analysis"
## [1] "Project ID: bulkliverdemo"
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so; LAPACK version 3.12.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] fgsea_1.34.2 scales_1.4.0
## [3] clusterProfiler_4.16.0 knitr_1.50
## [5] ggrepel_0.9.6 tidyr_1.3.1
## [7] genefilter_1.90.0 lattice_0.22-6
## [9] RColorBrewer_1.1-3 gplots_3.2.0
## [11] ggplot2_3.5.2 DESeq2_1.48.1
## [13] SummarizedExperiment_1.38.1 Biobase_2.68.0
## [15] MatrixGenerics_1.20.0 matrixStats_1.5.0
## [17] GenomicRanges_1.60.0 GenomeInfoDb_1.44.1
## [19] IRanges_2.42.0 S4Vectors_0.46.0
## [21] BiocGenerics_0.54.0 generics_0.1.4
## [23] RMySQL_0.11.1 DBI_1.2.3
## [25] biologicSeqTools2_0.0.0.9000 renv_1.0.2
## [27] remotes_2.5.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.5.0 bitops_1.0-9 ggplotify_0.1.2
## [4] filelock_1.0.3 tibble_3.3.0 R.oo_1.27.1
## [7] XML_3.99-0.18 lifecycle_1.0.4 httr2_1.2.1
## [10] doParallel_1.0.17 vroom_1.6.5 MASS_7.3-65
## [13] crosstalk_1.2.1 magrittr_2.0.3 openxlsx_4.2.8
## [16] sass_0.4.10 rmarkdown_2.29 jquerylib_0.1.4
## [19] yaml_2.3.10 ggtangle_0.0.7 zip_2.3.3
## [22] cowplot_1.2.0 abind_1.4-8 purrr_1.1.0
## [25] R.utils_2.13.0 yulab.utils_0.2.0 rappdirs_0.3.3
## [28] circlize_0.4.16 GenomeInfoDbData_1.2.14 enrichplot_1.28.4
## [31] tidytree_0.4.6 annotate_1.86.1 codetools_0.2-20
## [34] DelayedArray_0.34.1 DOSE_4.2.0 DT_0.33
## [37] xml2_1.3.8 tidyselect_1.2.1 shape_1.4.6.1
## [40] aplot_0.2.8 UCSC.utils_1.4.0 farver_2.1.2
## [43] BiocFileCache_2.16.1 jsonlite_2.0.0 GetoptLong_1.0.5
## [46] survival_3.8-3 iterators_1.0.14 foreach_1.5.2
## [49] tools_4.5.0 progress_1.2.3 treeio_1.32.0
## [52] Rcpp_1.1.0 glue_1.8.0 SparseArray_1.8.1
## [55] xfun_0.52 qvalue_2.40.0 dplyr_1.1.4
## [58] withr_3.0.2 BiocManager_1.30.26 fastmap_1.2.0
## [61] caTools_1.18.3 digest_0.6.37 R6_2.6.1
## [64] gridGraphics_0.5-1 colorspace_2.1-1 GO.db_3.21.0
## [67] gtools_3.9.5 biomaRt_2.64.0 RSQLite_2.4.2
## [70] R.methodsS3_1.8.2 utf8_1.2.6 data.table_1.17.8
## [73] prettyunits_1.2.0 httr_1.4.7 htmlwidgets_1.6.4
## [76] S4Arrays_1.8.1 pkgconfig_2.0.3 gtable_0.3.6
## [79] blob_1.2.4 ComplexHeatmap_2.24.1 XVector_0.48.0
## [82] htmltools_0.5.8.1 clue_0.3-66 png_0.1-8
## [85] ggfun_0.2.0 ggdendro_0.2.0 tzdb_0.5.0
## [88] reshape2_1.4.4 rjson_0.2.23 nlme_3.1-168
## [91] curl_6.4.0 cachem_1.1.0 GlobalOptions_0.1.2
## [94] stringr_1.5.1 KernSmooth_2.23-26 parallel_4.5.0
## [97] AnnotationDbi_1.70.0 pillar_1.11.0 grid_4.5.0
## [100] vctrs_0.6.5 dbplyr_2.5.0 xtable_1.8-4
## [103] cluster_2.1.8.1 evaluate_1.0.4 readr_2.1.5
## [106] cli_3.6.5 locfit_1.5-9.12 compiler_4.5.0
## [109] rlang_1.1.6 crayon_1.5.3 labeling_0.4.3
## [112] plyr_1.8.9 fs_1.6.6 stringi_1.8.7
## [115] BiocParallel_1.42.1 Biostrings_2.76.0 lazyeval_0.2.2
## [118] GOSemSim_2.34.0 Matrix_1.7-3 hms_1.1.3
## [121] patchwork_1.3.1 bit64_4.6.0-1 KEGGREST_1.48.1
## [124] igraph_2.1.4 memoise_2.0.1 bslib_0.9.0
## [127] ggtree_3.16.3 fastmatch_1.1-6 bit_4.6.0
## [130] ape_5.8-1 gson_0.1.0
The Francis Crick Institute, stefan.boeing@crick.ac.uk↩︎